NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=ResNeXt50 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=image Channels=3 Height=224 Width=224
BatchNorm FromTensor=image ToTensor=bn1 Epsilon=0.00002
Conv FromTensor=bn1 ToTensor=sevenDS ToChannels=64 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=3 PaddingW=3 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=sevenDS ToTensor=bn2 Epsilon=0.00002
Activation FromTensor=bn2 ToTensor=relu1 Kind=ReLU Param=0
Pooling FromTensor=relu1 ToTensor=pool1 Kind=Max3x3Stride2 PaddingH=1 PaddingW=1
Conv FromTensor=pool1 ToTensor=one1 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one1 ToTensor=bn3 Epsilon=0.00002
Conv FromTensor=pool1 ToTensor=one2 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one2 ToTensor=bn4 Epsilon=0.00002
Activation FromTensor=bn4 ToTensor=relu2 Kind=ReLU Param=0
Conv FromTensor=relu2 ToTensor=three1 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three1 ToTensor=bn5 Epsilon=0.00002
Activation FromTensor=bn5 ToTensor=relu3 Kind=ReLU Param=0
Conv FromTensor=relu3 ToTensor=one3 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one3 ToTensor=bn6 Epsilon=0.00002
Add FromTensor1=bn3 FromTensor2=bn6 ToTensor=add1
Activation FromTensor=add1 ToTensor=relu4 Kind=ReLU Param=0
Conv FromTensor=relu4 ToTensor=one4 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one4 ToTensor=bn7 Epsilon=0.00002
Activation FromTensor=bn7 ToTensor=relu5 Kind=ReLU Param=0
Conv FromTensor=relu5 ToTensor=three2 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three2 ToTensor=bn8 Epsilon=0.00002
Activation FromTensor=bn8 ToTensor=relu6 Kind=ReLU Param=0
Conv FromTensor=relu6 ToTensor=one5 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one5 ToTensor=bn9 Epsilon=0.00002
Add FromTensor1=relu4 FromTensor2=bn9 ToTensor=add2
Activation FromTensor=add2 ToTensor=relu7 Kind=ReLU Param=0
Conv FromTensor=relu7 ToTensor=one6 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one6 ToTensor=bn10 Epsilon=0.00002
Activation FromTensor=bn10 ToTensor=relu8 Kind=ReLU Param=0
Conv FromTensor=relu8 ToTensor=three3 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three3 ToTensor=bn11 Epsilon=0.00002
Activation FromTensor=bn11 ToTensor=relu9 Kind=ReLU Param=0
Conv FromTensor=relu9 ToTensor=one7 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one7 ToTensor=bn12 Epsilon=0.00002
Add FromTensor1=relu7 FromTensor2=bn12 ToTensor=add3
Activation FromTensor=add3 ToTensor=relu10 Kind=ReLU Param=0
Conv FromTensor=relu10 ToTensor=oneDS1 ToChannels=512 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS1 ToTensor=bn13 Epsilon=0.00002
Conv FromTensor=relu10 ToTensor=one8 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one8 ToTensor=bn14 Epsilon=0.00002
Activation FromTensor=bn14 ToTensor=relu11 Kind=ReLU Param=0
Conv FromTensor=relu11 ToTensor=threeDS1 ToChannels=256 FilterH=3 FilterW=3 StrideH=2 StrideW=2 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=threeDS1 ToTensor=bn15 Epsilon=0.00002
Activation FromTensor=bn15 ToTensor=relu12 Kind=ReLU Param=0
Conv FromTensor=relu12 ToTensor=one9 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one9 ToTensor=bn16 Epsilon=0.00002
Add FromTensor1=bn13 FromTensor2=bn16 ToTensor=add4
Activation FromTensor=add4 ToTensor=relu13 Kind=ReLU Param=0
Conv FromTensor=relu13 ToTensor=one10 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one10 ToTensor=bn17 Epsilon=0.00002
Activation FromTensor=bn17 ToTensor=relu14 Kind=ReLU Param=0
Conv FromTensor=relu14 ToTensor=three4 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three4 ToTensor=bn18 Epsilon=0.00002
Activation FromTensor=bn18 ToTensor=relu15 Kind=ReLU Param=0
Conv FromTensor=relu15 ToTensor=one11 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one11 ToTensor=bn19 Epsilon=0.00002
Add FromTensor1=relu13 FromTensor2=bn19 ToTensor=add5
Activation FromTensor=add5 ToTensor=relu16 Kind=ReLU Param=0
Conv FromTensor=relu16 ToTensor=one12 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one12 ToTensor=bn20 Epsilon=0.00002
Activation FromTensor=bn20 ToTensor=relu17 Kind=ReLU Param=0
Conv FromTensor=relu17 ToTensor=three5 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three5 ToTensor=bn21 Epsilon=0.00002
Activation FromTensor=bn21 ToTensor=relu18 Kind=ReLU Param=0
Conv FromTensor=relu18 ToTensor=one13 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one13 ToTensor=bn22 Epsilon=0.00002
Add FromTensor1=relu16 FromTensor2=bn22 ToTensor=add6
Activation FromTensor=add6 ToTensor=relu19 Kind=ReLU Param=0
Conv FromTensor=relu19 ToTensor=one14 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one14 ToTensor=bn23 Epsilon=0.00002
Activation FromTensor=bn23 ToTensor=relu20 Kind=ReLU Param=0
Conv FromTensor=relu20 ToTensor=three6 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three6 ToTensor=bn24 Epsilon=0.00002
Activation FromTensor=bn24 ToTensor=relu21 Kind=ReLU Param=0
Conv FromTensor=relu21 ToTensor=one15 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one15 ToTensor=bn25 Epsilon=0.00002
Add FromTensor1=relu19 FromTensor2=bn25 ToTensor=add7
Activation FromTensor=add7 ToTensor=relu22 Kind=ReLU Param=0
Conv FromTensor=relu22 ToTensor=oneDS2 ToChannels=1024 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS2 ToTensor=bn26 Epsilon=0.00002
Conv FromTensor=relu22 ToTensor=one16 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one16 ToTensor=bn27 Epsilon=0.00002
Activation FromTensor=bn27 ToTensor=relu23 Kind=ReLU Param=0
Conv FromTensor=relu23 ToTensor=threeDS2 ToChannels=512 FilterH=3 FilterW=3 StrideH=2 StrideW=2 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=threeDS2 ToTensor=bn28 Epsilon=0.00002
Activation FromTensor=bn28 ToTensor=relu24 Kind=ReLU Param=0
Conv FromTensor=relu24 ToTensor=one17 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one17 ToTensor=bn29 Epsilon=0.00002
Add FromTensor1=bn26 FromTensor2=bn29 ToTensor=add8
Activation FromTensor=add8 ToTensor=relu25 Kind=ReLU Param=0
Conv FromTensor=relu25 ToTensor=one18 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one18 ToTensor=bn30 Epsilon=0.00002
Activation FromTensor=bn30 ToTensor=relu26 Kind=ReLU Param=0
Conv FromTensor=relu26 ToTensor=three7 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three7 ToTensor=bn31 Epsilon=0.00002
Activation FromTensor=bn31 ToTensor=relu27 Kind=ReLU Param=0
Conv FromTensor=relu27 ToTensor=one19 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one19 ToTensor=bn32 Epsilon=0.00002
Add FromTensor1=relu25 FromTensor2=bn32 ToTensor=add9
Activation FromTensor=add9 ToTensor=relu28 Kind=ReLU Param=0
Conv FromTensor=relu28 ToTensor=one20 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one20 ToTensor=bn33 Epsilon=0.00002
Activation FromTensor=bn33 ToTensor=relu29 Kind=ReLU Param=0
Conv FromTensor=relu29 ToTensor=three8 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three8 ToTensor=bn34 Epsilon=0.00002
Activation FromTensor=bn34 ToTensor=relu30 Kind=ReLU Param=0
Conv FromTensor=relu30 ToTensor=one21 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one21 ToTensor=bn35 Epsilon=0.00002
Add FromTensor1=relu28 FromTensor2=bn35 ToTensor=add10
Activation FromTensor=add10 ToTensor=relu31 Kind=ReLU Param=0
Conv FromTensor=relu31 ToTensor=one22 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one22 ToTensor=bn36 Epsilon=0.00002
Activation FromTensor=bn36 ToTensor=relu32 Kind=ReLU Param=0
Conv FromTensor=relu32 ToTensor=three9 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three9 ToTensor=bn37 Epsilon=0.00002
Activation FromTensor=bn37 ToTensor=relu33 Kind=ReLU Param=0
Conv FromTensor=relu33 ToTensor=one23 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one23 ToTensor=bn38 Epsilon=0.00002
Add FromTensor1=relu31 FromTensor2=bn38 ToTensor=add11
Activation FromTensor=add11 ToTensor=relu34 Kind=ReLU Param=0
Conv FromTensor=relu34 ToTensor=one24 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one24 ToTensor=bn39 Epsilon=0.00002
Activation FromTensor=bn39 ToTensor=relu35 Kind=ReLU Param=0
Conv FromTensor=relu35 ToTensor=three10 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three10 ToTensor=bn40 Epsilon=0.00002
Activation FromTensor=bn40 ToTensor=relu36 Kind=ReLU Param=0
Conv FromTensor=relu36 ToTensor=one25 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one25 ToTensor=bn41 Epsilon=0.00002
Add FromTensor1=relu34 FromTensor2=bn41 ToTensor=add12
Activation FromTensor=add12 ToTensor=relu37 Kind=ReLU Param=0
Conv FromTensor=relu37 ToTensor=one26 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one26 ToTensor=bn42 Epsilon=0.00002
Activation FromTensor=bn42 ToTensor=relu38 Kind=ReLU Param=0
Conv FromTensor=relu38 ToTensor=three11 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three11 ToTensor=bn43 Epsilon=0.00002
Activation FromTensor=bn43 ToTensor=relu39 Kind=ReLU Param=0
Conv FromTensor=relu39 ToTensor=one27 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one27 ToTensor=bn44 Epsilon=0.00002
Add FromTensor1=relu37 FromTensor2=bn44 ToTensor=add13
Activation FromTensor=add13 ToTensor=relu40 Kind=ReLU Param=0
Conv FromTensor=relu40 ToTensor=oneDS3 ToChannels=2048 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS3 ToTensor=bn45 Epsilon=0.00002
Conv FromTensor=relu40 ToTensor=one28 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one28 ToTensor=bn46 Epsilon=0.00002
Activation FromTensor=bn46 ToTensor=relu41 Kind=ReLU Param=0
Conv FromTensor=relu41 ToTensor=threeDS3 ToChannels=1024 FilterH=3 FilterW=3 StrideH=2 StrideW=2 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=threeDS3 ToTensor=bn47 Epsilon=0.00002
Activation FromTensor=bn47 ToTensor=relu42 Kind=ReLU Param=0
Conv FromTensor=relu42 ToTensor=one29 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one29 ToTensor=bn48 Epsilon=0.00002
Add FromTensor1=bn45 FromTensor2=bn48 ToTensor=add14
Activation FromTensor=add14 ToTensor=relu43 Kind=ReLU Param=0
Conv FromTensor=relu43 ToTensor=one30 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one30 ToTensor=bn49 Epsilon=0.00002
Activation FromTensor=bn49 ToTensor=relu44 Kind=ReLU Param=0
Conv FromTensor=relu44 ToTensor=three12 ToChannels=1024 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three12 ToTensor=bn50 Epsilon=0.00002
Activation FromTensor=bn50 ToTensor=relu45 Kind=ReLU Param=0
Conv FromTensor=relu45 ToTensor=one31 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one31 ToTensor=bn51 Epsilon=0.00002
Add FromTensor1=relu43 FromTensor2=bn51 ToTensor=add15
Activation FromTensor=add15 ToTensor=relu46 Kind=ReLU Param=0
Conv FromTensor=relu46 ToTensor=one32 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one32 ToTensor=bn52 Epsilon=0.00002
Activation FromTensor=bn52 ToTensor=relu47 Kind=ReLU Param=0
Conv FromTensor=relu47 ToTensor=three13 ToChannels=1024 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three13 ToTensor=bn53 Epsilon=0.00002
Activation FromTensor=bn53 ToTensor=relu48 Kind=ReLU Param=0
Conv FromTensor=relu48 ToTensor=one33 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one33 ToTensor=bn54 Epsilon=0.00002
Add FromTensor1=relu46 FromTensor2=bn54 ToTensor=add16
Activation FromTensor=add16 ToTensor=relu49 Kind=ReLU Param=0
Pooling FromTensor=relu49 ToTensor=pool2 Kind=AvgGlobal PaddingH=0 PaddingW=0
FullyConnected FromTensor=pool2 ToTensor=fc ToChannels=1000
Softmax FromTensor=fc ToTensor=prob
Output FromTensor=prob

Top || Output ResNeXt50.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(ResNeXt50Params);
// ResNeXt50Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration of the trained-parameter struct; its definition
// (one float array per parameter tensor) appears later in this header.
typedef struct ResNeXt50Params ResNeXt50Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// ResNeXt50Params* params = malloc(sizeof(ResNeXt50Params));
//
// ... Load params (read from a file, perhaps) ...
//
// ResNeXt50Net* net; // For example, 4 threads:
// char* err = ResNeXt50NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// ResNeXt50NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Forward declaration: the functions declared in this header refer to
// the Net only through pointers.
typedef struct ResNeXt50Net ResNeXt50Net;

// Creates a Net from the trained parameters, using `threads` temporary
// threads for the construction work. The params struct is not modified.
//
// Returns null on success. On failure returns an error string (without
// a trailing newline) that the caller must free(); *net is then left
// unmodified.
char* ResNeXt50NetCreate(
	ResNeXt50Net** net,
	ResNeXt50Params* params,
	ptrdiff_t threads
);

// Frees the memory held by a Net. Call this only after all inference
// that depends on the Net is complete.
void ResNeXt50NetDestroy(ResNeXt50Net* net);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// ResNeXt50Net* net;
//
// ... Create net ...
//
// ResNeXt50Engine* engine; // For example, 4 inference threads:
// char* err = ResNeXt50EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// ResNeXt50EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = ResNeXt50EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* imageData = malloc(sizeof(float)*3*224*224);
// float* probData = malloc(sizeof(float)*1000*1*1);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// ResNeXt50EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// imageData, // The tensor arguments are sorted by name.
// probData
// );
//
// ... Read the output floats ...
//
// }
//
// free(imageData);
// free(probData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Forward declaration: the functions declared in this header refer to
// the Engine only through pointers.
typedef struct ResNeXt50Engine ResNeXt50Engine;

// Creates an Engine that performs inference against `net` using
// `threads` inference threads.
//
// Returns null on success. On failure returns an error string (without
// a trailing newline) that the caller must free(); *engine is then left
// unmodified.
char* ResNeXt50EngineCreate(
	ResNeXt50Engine** engine,
	ResNeXt50Net* net,
	ptrdiff_t threads
);

// Writes to *to the pthread_t identifier of the Engine's inference
// thread at index threadIdx. An Engine with N threads has thread
// indices 0, 1, ..., N-1.
//
// Returns null on success; a non-null return is an error, which can
// only happen when the thread index is invalid.
char* ResNeXt50EnginePthreadT(
	ResNeXt50Engine* engine,
	ptrdiff_t threadIdx,
	pthread_t* to
);

// Runs one inference pass; this function cannot fail.
//
// imageData is the input tensor (3x224x224 floats) and probData is the
// output tensor (1000x1x1 floats). Both are caller-owned, in CHW format,
// 32-bit floating point, fully packed with no padding. The tensor
// parameters are ordered by name (bytewise lexical sort).
void ResNeXt50EngineInference(
	ResNeXt50Engine* engine,
	float* imageData,
	float* probData
);

// Terminates the Engine's inference threads and frees its memory.
// The usage example in this header destroys the Net separately, after
// the Engine has been destroyed.
void ResNeXt50EngineDestroy(ResNeXt50Engine* engine);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

struct ResNeXt50Params {
float bn10Means[128]; // 1x128x1x1
float bn10Scales[128]; // 1x128x1x1
float bn10Shifts[128]; // 1x128x1x1
float bn10Variances[128]; // 1x128x1x1
float bn11Means[128]; // 1x128x1x1
float bn11Scales[128]; // 1x128x1x1
float bn11Shifts[128]; // 1x128x1x1
float bn11Variances[128]; // 1x128x1x1
float bn12Means[256]; // 1x256x1x1
float bn12Scales[256]; // 1x256x1x1
float bn12Shifts[256]; // 1x256x1x1
float bn12Variances[256]; // 1x256x1x1
float bn13Means[512]; // 1x512x1x1
float bn13Scales[512]; // 1x512x1x1
float bn13Shifts[512]; // 1x512x1x1
float bn13Variances[512]; // 1x512x1x1
float bn14Means[256]; // 1x256x1x1
float bn14Scales[256]; // 1x256x1x1
float bn14Shifts[256]; // 1x256x1x1
float bn14Variances[256]; // 1x256x1x1
float bn15Means[256]; // 1x256x1x1
float bn15Scales[256]; // 1x256x1x1
float bn15Shifts[256]; // 1x256x1x1
float bn15Variances[256]; // 1x256x1x1
float bn16Means[512]; // 1x512x1x1
float bn16Scales[512]; // 1x512x1x1
float bn16Shifts[512]; // 1x512x1x1
float bn16Variances[512]; // 1x512x1x1
float bn17Means[256]; // 1x256x1x1
float bn17Scales[256]; // 1x256x1x1
float bn17Shifts[256]; // 1x256x1x1
float bn17Variances[256]; // 1x256x1x1
float bn18Means[256]; // 1x256x1x1
float bn18Scales[256]; // 1x256x1x1
float bn18Shifts[256]; // 1x256x1x1
float bn18Variances[256]; // 1x256x1x1
float bn19Means[512]; // 1x512x1x1
float bn19Scales[512]; // 1x512x1x1
float bn19Shifts[512]; // 1x512x1x1
float bn19Variances[512]; // 1x512x1x1
float bn1Means[3]; // 1x3x1x1
float bn1Scales[3]; // 1x3x1x1
float bn1Shifts[3]; // 1x3x1x1
float bn1Variances[3]; // 1x3x1x1
float bn20Means[256]; // 1x256x1x1
float bn20Scales[256]; // 1x256x1x1
float bn20Shifts[256]; // 1x256x1x1
float bn20Variances[256]; // 1x256x1x1
float bn21Means[256]; // 1x256x1x1
float bn21Scales[256]; // 1x256x1x1
float bn21Shifts[256]; // 1x256x1x1
float bn21Variances[256]; // 1x256x1x1
float bn22Means[512]; // 1x512x1x1
float bn22Scales[512]; // 1x512x1x1
float bn22Shifts[512]; // 1x512x1x1
float bn22Variances[512]; // 1x512x1x1
float bn23Means[256]; // 1x256x1x1
float bn23Scales[256]; // 1x256x1x1
float bn23Shifts[256]; // 1x256x1x1
float bn23Variances[256]; // 1x256x1x1
float bn24Means[256]; // 1x256x1x1
float bn24Scales[256]; // 1x256x1x1
float bn24Shifts[256]; // 1x256x1x1
float bn24Variances[256]; // 1x256x1x1
float bn25Means[512]; // 1x512x1x1
float bn25Scales[512]; // 1x512x1x1
float bn25Shifts[512]; // 1x512x1x1
float bn25Variances[512]; // 1x512x1x1
float bn26Means[1024]; // 1x1024x1x1
float bn26Scales[1024]; // 1x1024x1x1
float bn26Shifts[1024]; // 1x1024x1x1
float bn26Variances[1024]; // 1x1024x1x1
float bn27Means[512]; // 1x512x1x1
float bn27Scales[512]; // 1x512x1x1
float bn27Shifts[512]; // 1x512x1x1
float bn27Variances[512]; // 1x512x1x1
float bn28Means[512]; // 1x512x1x1
float bn28Scales[512]; // 1x512x1x1
float bn28Shifts[512]; // 1x512x1x1
float bn28Variances[512]; // 1x512x1x1
float bn29Means[1024]; // 1x1024x1x1
float bn29Scales[1024]; // 1x1024x1x1
float bn29Shifts[1024]; // 1x1024x1x1
float bn29Variances[1024]; // 1x1024x1x1
float bn2Means[64]; // 1x64x1x1
float bn2Scales[64]; // 1x64x1x1
float bn2Shifts[64]; // 1x64x1x1
float bn2Variances[64]; // 1x64x1x1
float bn30Means[512]; // 1x512x1x1
float bn30Scales[512]; // 1x512x1x1
float bn30Shifts[512]; // 1x512x1x1
float bn30Variances[512]; // 1x512x1x1
float bn31Means[512]; // 1x512x1x1
float bn31Scales[512]; // 1x512x1x1
float bn31Shifts[512]; // 1x512x1x1
float bn31Variances[512]; // 1x512x1x1
float bn32Means[1024]; // 1x1024x1x1
float bn32Scales[1024]; // 1x1024x1x1
float bn32Shifts[1024]; // 1x1024x1x1
float bn32Variances[1024]; // 1x1024x1x1
float bn33Means[512]; // 1x512x1x1
float bn33Scales[512]; // 1x512x1x1
float bn33Shifts[512]; // 1x512x1x1
float bn33Variances[512]; // 1x512x1x1
float bn34Means[512]; // 1x512x1x1
float bn34Scales[512]; // 1x512x1x1
float bn34Shifts[512]; // 1x512x1x1
float bn34Variances[512]; // 1x512x1x1
float bn35Means[1024]; // 1x1024x1x1
float bn35Scales[1024]; // 1x1024x1x1
float bn35Shifts[1024]; // 1x1024x1x1
float bn35Variances[1024]; // 1x1024x1x1
float bn36Means[512]; // 1x512x1x1
float bn36Scales[512]; // 1x512x1x1
float bn36Shifts[512]; // 1x512x1x1
float bn36Variances[512]; // 1x512x1x1
float bn37Means[512]; // 1x512x1x1
float bn37Scales[512]; // 1x512x1x1
float bn37Shifts[512]; // 1x512x1x1
float bn37Variances[512]; // 1x512x1x1
float bn38Means[1024]; // 1x1024x1x1
float bn38Scales[1024]; // 1x1024x1x1
float bn38Shifts[1024]; // 1x1024x1x1
float bn38Variances[1024]; // 1x1024x1x1
float bn39Means[512]; // 1x512x1x1
float bn39Scales[512]; // 1x512x1x1
float bn39Shifts[512]; // 1x512x1x1
float bn39Variances[512]; // 1x512x1x1
float bn3Means[256]; // 1x256x1x1
float bn3Scales[256]; // 1x256x1x1
float bn3Shifts[256]; // 1x256x1x1
float bn3Variances[256]; // 1x256x1x1
float bn40Means[512]; // 1x512x1x1
float bn40Scales[512]; // 1x512x1x1
float bn40Shifts[512]; // 1x512x1x1
float bn40Variances[512]; // 1x512x1x1
float bn41Means[1024]; // 1x1024x1x1
float bn41Scales[1024]; // 1x1024x1x1
float bn41Shifts[1024]; // 1x1024x1x1
float bn41Variances[1024]; // 1x1024x1x1
float bn42Means[512]; // 1x512x1x1
float bn42Scales[512]; // 1x512x1x1
float bn42Shifts[512]; // 1x512x1x1
float bn42Variances[512]; // 1x512x1x1
float bn43Means[512]; // 1x512x1x1
float bn43Scales[512]; // 1x512x1x1
float bn43Shifts[512]; // 1x512x1x1
float bn43Variances[512]; // 1x512x1x1
float bn44Means[1024]; // 1x1024x1x1
float bn44Scales[1024]; // 1x1024x1x1
float bn44Shifts[1024]; // 1x1024x1x1
float bn44Variances[1024]; // 1x1024x1x1
float bn45Means[2048]; // 1x2048x1x1
float bn45Scales[2048]; // 1x2048x1x1
float bn45Shifts[2048]; // 1x2048x1x1
float bn45Variances[2048]; // 1x2048x1x1
float bn46Means[1024]; // 1x1024x1x1
float bn46Scales[1024]; // 1x1024x1x1
float bn46Shifts[1024]; // 1x1024x1x1
float bn46Variances[1024]; // 1x1024x1x1
float bn47Means[1024]; // 1x1024x1x1
float bn47Scales[1024]; // 1x1024x1x1
float bn47Shifts[1024]; // 1x1024x1x1
float bn47Variances[1024]; // 1x1024x1x1
float bn48Means[2048]; // 1x2048x1x1
float bn48Scales[2048]; // 1x2048x1x1
float bn48Shifts[2048]; // 1x2048x1x1
float bn48Variances[2048]; // 1x2048x1x1
float bn49Means[1024]; // 1x1024x1x1
float bn49Scales[1024]; // 1x1024x1x1
float bn49Shifts[1024]; // 1x1024x1x1
float bn49Variances[1024]; // 1x1024x1x1
float bn4Means[128]; // 1x128x1x1
float bn4Scales[128]; // 1x128x1x1
float bn4Shifts[128]; // 1x128x1x1
float bn4Variances[128]; // 1x128x1x1
float bn50Means[1024]; // 1x1024x1x1
float bn50Scales[1024]; // 1x1024x1x1
float bn50Shifts[1024]; // 1x1024x1x1
float bn50Variances[1024]; // 1x1024x1x1
float bn51Means[2048]; // 1x2048x1x1
float bn51Scales[2048]; // 1x2048x1x1
float bn51Shifts[2048]; // 1x2048x1x1
float bn51Variances[2048]; // 1x2048x1x1
float bn52Means[1024]; // 1x1024x1x1
float bn52Scales[1024]; // 1x1024x1x1
float bn52Shifts[1024]; // 1x1024x1x1
float bn52Variances[1024]; // 1x1024x1x1
float bn53Means[1024]; // 1x1024x1x1
float bn53Scales[1024]; // 1x1024x1x1
float bn53Shifts[1024]; // 1x1024x1x1
float bn53Variances[1024]; // 1x1024x1x1
float bn54Means[2048]; // 1x2048x1x1
float bn54Scales[2048]; // 1x2048x1x1
float bn54Shifts[2048]; // 1x2048x1x1
float bn54Variances[2048]; // 1x2048x1x1
float bn5Means[128]; // 1x128x1x1
float bn5Scales[128]; // 1x128x1x1
float bn5Shifts[128]; // 1x128x1x1
float bn5Variances[128]; // 1x128x1x1
float bn6Means[256]; // 1x256x1x1
float bn6Scales[256]; // 1x256x1x1
float bn6Shifts[256]; // 1x256x1x1
float bn6Variances[256]; // 1x256x1x1
float bn7Means[128]; // 1x128x1x1
float bn7Scales[128]; // 1x128x1x1
float bn7Shifts[128]; // 1x128x1x1
float bn7Variances[128]; // 1x128x1x1
float bn8Means[128]; // 1x128x1x1
float bn8Scales[128]; // 1x128x1x1
float bn8Shifts[128]; // 1x128x1x1
float bn8Variances[128]; // 1x128x1x1
float bn9Means[256]; // 1x256x1x1
float bn9Scales[256]; // 1x256x1x1
float bn9Shifts[256]; // 1x256x1x1
float bn9Variances[256]; // 1x256x1x1
float fcBiases[1000]; // 1x1000x1x1
float fcWeights[2048000]; // 1000x2048x1x1
float one10Biases[256]; // 1x256x1x1
float one10Weights[131072]; // 256x512x1x1
float one11Biases[512]; // 1x512x1x1
float one11Weights[131072]; // 512x256x1x1
float one12Biases[256]; // 1x256x1x1
float one12Weights[131072]; // 256x512x1x1
float one13Biases[512]; // 1x512x1x1
float one13Weights[131072]; // 512x256x1x1
float one14Biases[256]; // 1x256x1x1
float one14Weights[131072]; // 256x512x1x1
float one15Biases[512]; // 1x512x1x1
float one15Weights[131072]; // 512x256x1x1
float one16Biases[512]; // 1x512x1x1
float one16Weights[262144]; // 512x512x1x1
float one17Biases[1024]; // 1x1024x1x1
float one17Weights[524288]; // 1024x512x1x1
float one18Biases[512]; // 1x512x1x1
float one18Weights[524288]; // 512x1024x1x1
float one19Biases[1024]; // 1x1024x1x1
float one19Weights[524288]; // 1024x512x1x1
float one1Biases[256]; // 1x256x1x1
float one1Weights[16384]; // 256x64x1x1
float one20Biases[512]; // 1x512x1x1
float one20Weights[524288]; // 512x1024x1x1
float one21Biases[1024]; // 1x1024x1x1
float one21Weights[524288]; // 1024x512x1x1
float one22Biases[512]; // 1x512x1x1
float one22Weights[524288]; // 512x1024x1x1
float one23Biases[1024]; // 1x1024x1x1
float one23Weights[524288]; // 1024x512x1x1
float one24Biases[512]; // 1x512x1x1
float one24Weights[524288]; // 512x1024x1x1
float one25Biases[1024]; // 1x1024x1x1
float one25Weights[524288]; // 1024x512x1x1
float one26Biases[512]; // 1x512x1x1
float one26Weights[524288]; // 512x1024x1x1
float one27Biases[1024]; // 1x1024x1x1
float one27Weights[524288]; // 1024x512x1x1
float one28Biases[1024]; // 1x1024x1x1
float one28Weights[1048576]; // 1024x1024x1x1
float one29Biases[2048]; // 1x2048x1x1
float one29Weights[2097152]; // 2048x1024x1x1
float one2Biases[128]; // 1x128x1x1
float one2Weights[8192]; // 128x64x1x1
float one30Biases[1024]; // 1x1024x1x1
float one30Weights[2097152]; // 1024x2048x1x1
float one31Biases[2048]; // 1x2048x1x1
float one31Weights[2097152]; // 2048x1024x1x1
float one32Biases[1024]; // 1x1024x1x1
float one32Weights[2097152]; // 1024x2048x1x1
float one33Biases[2048]; // 1x2048x1x1
float one33Weights[2097152]; // 2048x1024x1x1
float one3Biases[256]; // 1x256x1x1
float one3Weights[32768]; // 256x128x1x1
float one4Biases[128]; // 1x128x1x1
float one4Weights[32768]; // 128x256x1x1
float one5Biases[256]; // 1x256x1x1
float one5Weights[32768]; // 256x128x1x1
float one6Biases[128]; // 1x128x1x1
float one6Weights[32768]; // 128x256x1x1
float one7Biases[256]; // 1x256x1x1
float one7Weights[32768]; // 256x128x1x1
float one8Biases[256]; // 1x256x1x1
float one8Weights[65536]; // 256x256x1x1
float one9Biases[512]; // 1x512x1x1
float one9Weights[131072]; // 512x256x1x1
float oneDS1Biases[512]; // 1x512x1x1
float oneDS1Weights[131072]; // 512x256x1x1
float oneDS2Biases[1024]; // 1x1024x1x1
float oneDS2Weights[524288]; // 1024x512x1x1
float oneDS3Biases[2048]; // 1x2048x1x1
float oneDS3Weights[2097152]; // 2048x1024x1x1
float sevenDSBiases[64]; // 1x64x1x1
float sevenDSWeights[9408]; // 64x3x7x7
float three10Biases[512]; // 1x512x1x1
float three10Weights[73728]; // 512x16x3x3
float three11Biases[512]; // 1x512x1x1
float three11Weights[73728]; // 512x16x3x3
float three12Biases[1024]; // 1x1024x1x1
float three12Weights[294912]; // 1024x32x3x3
float three13Biases[1024]; // 1x1024x1x1
float three13Weights[294912]; // 1024x32x3x3
float three1Biases[128]; // 1x128x1x1
float three1Weights[4608]; // 128x4x3x3
float three2Biases[128]; // 1x128x1x1
float three2Weights[4608]; // 128x4x3x3
float three3Biases[128]; // 1x128x1x1
float three3Weights[4608]; // 128x4x3x3
float three4Biases[256]; // 1x256x1x1
float three4Weights[18432]; // 256x8x3x3
float three5Biases[256]; // 1x256x1x1
float three5Weights[18432]; // 256x8x3x3
float three6Biases[256]; // 1x256x1x1
float three6Weights[18432]; // 256x8x3x3
float three7Biases[512]; // 1x512x1x1
float three7Weights[73728]; // 512x16x3x3
float three8Biases[512]; // 1x512x1x1
float three8Weights[73728]; // 512x16x3x3
float three9Biases[512]; // 1x512x1x1
float three9Weights[73728]; // 512x16x3x3
float threeDS1Biases[256]; // 1x256x1x1
float threeDS1Weights[18432]; // 256x8x3x3
float threeDS2Biases[512]; // 1x512x1x1
float threeDS2Weights[73728]; // 512x16x3x3
float threeDS3Biases[1024]; // 1x1024x1x1
float threeDS3Weights[294912]; // 1024x32x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output ResNeXt50.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f ResNeXt50.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "ResNeXt50.h"

// Builds a heap-allocated error string of the form
// "ResNeXt50: line <lineNum1>: <formatted text>".
// The result lives in a 277-byte buffer obtained from malloc; the formatted
// part is truncated by vsnprintf if it does not fit.
// NOTE(review): the malloc result is not checked before it is written to.
static char* ResNeXt50Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 277 };
    char* text = malloc(capacity);
    // Prefix first; its length tells us where the caller's message starts.
    int used = sprintf(text, "ResNeXt50: line %td: ", lineNum1);
    char* tail = text+used;
    va_list args;
    va_start(args, format1);
    vsnprintf(tail, capacity-used, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("threader") types defined below,
// plus the signature of a task callback: it receives the task and a pointer
// to the coordinates of the point to process.
typedef struct ResNeXt50ThreaderTask1 ResNeXt50ThreaderTask1;
typedef void (*ResNeXt50ThreaderCallee1)(ResNeXt50ThreaderTask1*, int64_t*);
typedef struct ResNeXt50ThreaderHub1 ResNeXt50ThreaderHub1;
typedef struct ResNeXt50ThreaderNode1 ResNeXt50ThreaderNode1;
typedef struct ResNeXt50ThreaderUnwind1 ResNeXt50ThreaderUnwind1;
typedef struct ResNeXt50ThreaderTeam1 ResNeXt50ThreaderTeam1;

// Describes one parallel job: callee1 is invoked once for every point of an
// nd1-dimensional index space whose per-dimension extents are given by hull1.
struct ResNeXt50ThreaderTask1 {
ResNeXt50ThreaderCallee1 callee1; // invoked as callee1(task, point)
void* any1; // opaque payload; the threader code in this part of the file never reads it
ptrdiff_t nd1; // number of dimensions in use (hull1 has room for 4)
int64_t hull1[4]; // extent of each dimension; a coordinate wraps to 0 on reaching it
};

// State shared by the whole team while a task is running.
struct ResNeXt50ThreaderHub1 {
pthread_mutex_t mut1; // guards every field below
pthread_cond_t cond1; // signalled by the worker that drops pending1 to zero
ptrdiff_t pending1; // workers that have not yet finished the current task
ptrdiff_t offset1; // word index in status1 where the next scan for a node to help starts
long mask1; // bits of status1[offset1] still eligible for that scan
long status1[]; // one bit per node, set while the node may still have points left
};

// Per-thread state; aligned to 64 bytes.
struct ResNeXt50ThreaderNode1 {
pthread_mutex_t mut2; // guards np1, pt1 and task1
int64_t np1; // points left in this node's share; a negative value asks the thread to exit
int64_t pt1[4]; // coordinates of the next point to hand out
ResNeXt50ThreaderTask1* task1; // task waiting to be picked up, or 0 when there is none
pthread_cond_t cond2; // signalled after task1 has been set
ResNeXt50ThreaderTeam1* team1; // owning team
pthread_t thr1; // handle of the thread running ResNeXt50ThreaderMain1 on this node
} __attribute__((aligned(64)));

// Records how far team construction got, so that ResNeXt50ThreaderDestroy1
// undoes exactly the steps that succeeded.
struct ResNeXt50ThreaderUnwind1 {
ptrdiff_t join1; // number of leading nodes whose thread was created and must be joined
ptrdiff_t nodeConds1; // number of leading nodes whose cond2 was initialized
ptrdiff_t nodeMuts1; // number of leading nodes whose mut2 was initialized
ptrdiff_t hubCond1; // nonzero once the hub's cond1 was initialized
ptrdiff_t hubMut1; // nonzero once the hub's mut1 was initialized
void* nodes1; // unaligned malloc result backing the node array (passed to free)
void* hub1; // unaligned malloc result backing the hub (passed to free)
};

// A team of worker threads together with its shared hub.
struct ResNeXt50ThreaderTeam1 {
ptrdiff_t nt1; // number of threads (and nodes) in the team
ResNeXt50ThreaderHub1* hub2; // 64-byte-aligned pointer into the unwind1.hub1 allocation
ResNeXt50ThreaderNode1* nodes2; // 64-byte-aligned pointer into the unwind1.nodes1 allocation
ResNeXt50ThreaderUnwind1 unwind1; // teardown bookkeeping
};

// Advances the multi-dimensional point pt2 by one step, odometer style:
// dimension 0 moves fastest, and a coordinate that reaches its extent in
// hull2 wraps to zero and carries into the next dimension.
// Only the first nd2 dimensions are considered.
static void ResNeXt50ThreaderInc1(
ptrdiff_t nd2,
int64_t*restrict hull2,
int64_t*restrict pt2
) {
    ptrdiff_t axis = 0;
    while (axis < nd2) {
        int64_t next = pt2[axis]+1;
        if (next != hull2[axis]) {
            // No wrap on this axis, so the carry stops here.
            pt2[axis] = next;
            return;
        }
        pt2[axis] = 0;
        ++axis;
    }
}

// Converts the linear index val1 into coordinates pt3 within the index space
// whose extents are hull3 (dimension 0 varies fastest). Coordinates beyond the
// point where the remaining value becomes zero are set to zero.
// Only the first nd3 entries of pt3 are written.
static void ResNeXt50ThreaderPut1(
ptrdiff_t nd3,
int64_t*restrict hull3,
int64_t*restrict pt3,
int64_t val1
) {
    ptrdiff_t axis = 0;
    while (axis < nd3 && val1 != 0) {
        int64_t extent = hull3[axis];
        int64_t quotient = val1/extent;
        // Remainder on this axis; the quotient carries to the next one.
        pt3[axis] = val1-quotient*extent;
        val1 = quotient;
        ++axis;
    }
    while (axis < nd3) {
        pt3[axis] = 0;
        ++axis;
    }
}

// Adds the offset plus1 (and an initial carry of carry2) to the point pt4,
// coordinate by coordinate, within the index space whose extents are hull4.
// A coordinate sum that reaches its extent has the extent subtracted once and
// carries 1 into the next dimension. The carry out of the last dimension is
// discarded.
static void ResNeXt50ThreaderAdd1(
ptrdiff_t nd4,
int64_t*restrict hull4,
int64_t*restrict pt4,
int64_t*restrict plus1,
int64_t carry2
) {
    for (ptrdiff_t axis = 0; axis < nd4; ++axis) {
        int64_t extent = hull4[axis];
        int64_t total = pt4[axis]+plus1[axis]+carry2;
        carry2 = total >= extent;
        pt4[axis] = carry2 ? total-extent : total;
    }
}

// Worker thread entry point. arg1 is the thread's own ResNeXt50ThreaderNode1.
// The thread sleeps until a task is posted on its node, processes its own
// share of points, then helps other nodes whose bit is still set in the hub's
// status bitmap, and finally reports completion to the hub.
// Returns 0 when the node is told to shut down (task set and np1 negative).
// Every pthread call is retried until it returns 0.
static void* ResNeXt50ThreaderMain1(void* arg1) {
ResNeXt50ThreaderNode1* node1 = arg1;
ResNeXt50ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
ResNeXt50ThreaderHub1* hub3 = team2->hub2;
ResNeXt50ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team; also its bit position in status1.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// The node mutex is held at the top of every iteration.
ResNeXt50ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: sleep, then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Shutdown request (see ResNeXt50ThreaderDestroy1).
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so the next wait blocks until a new one arrives.
node1->task1 = 0;
ResNeXt50ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own points. Each point is claimed under the lock and
// the callee runs with the lock released, so helpers can claim points too.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
ResNeXt50ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own share is exhausted: clear this node's bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Look for another node that may still have points, starting at the shared
// scan position. Bit number nt2 is set by ResNeXt50ThreaderDo1 and is never
// cleared here, so it acts as an end-of-bitmap sentinel.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate left in this word: move to the next one.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Reached the sentinel: rescan once from the beginning, then give up.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest candidate bit and move the shared scan position past
// it, so the next scanning worker starts with a different node.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
ResNeXt50ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Help drain the target node's remaining points, same protocol as above.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
ResNeXt50ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The target has no points left: clear its bit, then resume scanning from
// the (possibly updated) shared position.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// No node with remaining points was found: this worker is done.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
// The last worker to finish wakes the thread waiting in ResNeXt50ThreaderDo1.
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, fully or partially constructed, guided by the counters
// in team3->unwind1. A null team3 is accepted and ignored.
// Order: ask every started thread to exit, join them, destroy the per-node
// condition variables and mutexes, destroy the hub's condition variable and
// mutex if they were created, then free the three allocations.
// Every pthread call is retried until it returns 0.
static void ResNeXt50ThreaderDestroy1(ResNeXt50ThreaderTeam1* team3) {
    if (team3 == 0) {
        return;
    }
    ResNeXt50ThreaderNode1* const first = team3->nodes2;
    ResNeXt50ThreaderUnwind1* const undo = &team3->unwind1;
    ptrdiff_t idx;
    // A negative np1 with a non-null task1 makes ResNeXt50ThreaderMain1 return.
    for (idx = 0; idx != undo->join1; ++idx) {
        ResNeXt50ThreaderNode1* cur = first+idx;
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        cur->task1 = (ResNeXt50ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    for (idx = 0; idx != undo->join1; ++idx) {
        while (__builtin_expect(pthread_join(first[idx].thr1, 0), 0)) {}
    }
    for (idx = 0; idx != undo->nodeConds1; ++idx) {
        while (__builtin_expect(pthread_cond_destroy(&first[idx].cond2), 0)) {}
    }
    for (idx = 0; idx != undo->nodeMuts1; ++idx) {
        while (__builtin_expect(pthread_mutex_destroy(&first[idx].mut2), 0)) {}
    }
    ResNeXt50ThreaderHub1* const hub = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: for each of the nt7 nodes, initializes its mutex
// and condition variable and starts its worker thread.
// Returns 0 on success, or a heap-allocated error message. In either case the
// unwind counters are set to the number of mutexes, condition variables and
// threads that were actually created, so ResNeXt50ThreaderDestroy1 undoes
// exactly those.
static char* ResNeXt50ThreaderCreate1Up4(ResNeXt50ThreaderTeam1* team8, ptrdiff_t nt7) {
ResNeXt50ThreaderNode1* nodes5 = team8->nodes2;
for (ResNeXt50ThreaderNode1* node4 = nodes5; node4 != nodes5+nt7; ++node4) {
int err2 = pthread_mutex_init(&node4->mut2, 0);
if (__builtin_expect(err2, 0)) {
char* msg2 = ResNeXt50Errmsg1(__LINE__, "errno %d", err2);
// Nothing was created for this node; earlier nodes are complete.
team8->unwind1.nodeMuts1 = node4-nodes5;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg2;
}
// The new thread must see "no task yet" when it first inspects the node.
node4->task1 = 0;
int err3 = pthread_cond_init(&node4->cond2, 0);
if (__builtin_expect(err3, 0)) {
char* msg3 = ResNeXt50Errmsg1(__LINE__, "errno %d", err3);
// This node has a mutex but neither a condition variable nor a thread.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5;
team8->unwind1.join1 = node4-nodes5;
return msg3;
}
node4->team1 = team8;
int err4 = pthread_create(&node4->thr1, 0, ResNeXt50ThreaderMain1, node4);
if (__builtin_expect(err4, 0)) {
char* msg4 = ResNeXt50Errmsg1(__LINE__, "errno %d", err4);
// This node has a mutex and a condition variable but no thread.
team8->unwind1.nodeMuts1 = node4-nodes5+1;
team8->unwind1.nodeConds1 = node4-nodes5+1;
team8->unwind1.join1 = node4-nodes5;
return msg4;
}
}
team8->unwind1.nodeMuts1 = nt7;
team8->unwind1.nodeConds1 = nt7;
team8->unwind1.join1 = nt7;
return 0;
}

// Construction step 3: initializes the hub's mutex and condition variable,
// recording each success in the unwind flags, then continues with the
// per-node setup. Returns 0 on success or a heap-allocated error message.
static char* ResNeXt50ThreaderCreate1Up3(ResNeXt50ThreaderTeam1* team7, ptrdiff_t nt6) {
    ResNeXt50ThreaderHub1* hub = team7->hub2;
    ResNeXt50ThreaderUnwind1* undo = &team7->unwind1;
    int rc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(rc != 0, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "errno %d", rc);
    }
    undo->hubMut1 = 1;
    rc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(rc != 0, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "errno %d", rc);
    }
    undo->hubCond1 = 1;
    return ResNeXt50ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the array of nt5 nodes with 63 bytes of
// slack, stores the raw pointer for later free() and a 64-byte-aligned
// pointer for use, then continues with hub initialization.
// Returns 0 on success or a heap-allocated error message.
static char* ResNeXt50ThreaderCreate1Up2(ResNeXt50ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(ResNeXt50ThreaderNode1);
    size_t total = (size_t)nt5*nodeBytes;
    // Dividing back detects wrap-around in the multiplication.
    if (__builtin_expect(total/nodeBytes != (size_t)nt5, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(total+63);
    if (__builtin_expect(raw == 0, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    // Round the address up to the next multiple of 64.
    size_t aligned = ((size_t)raw+63)&~(size_t)63;
    team6->nodes2 = (void*)aligned;
    return ResNeXt50ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub,
// including a status bitmap of nt4/(bits per long)+1 words. The raw pointer
// is kept for free() and a 64-byte-aligned pointer is used for access.
// Returns 0 on success or a heap-allocated error message.
static char* ResNeXt50ThreaderCreate1Up1(ResNeXt50ThreaderTeam1* team5, ptrdiff_t nt4) {
    const size_t bitsPerWord = sizeof(long)*8;
    team5->nt1 = nt4;
    size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(ResNeXt50ThreaderHub1)+sizeof(long)*words;
    // Round the size up to a multiple of 64.
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    // Round the address up to the next multiple of 64.
    size_t aligned = ((size_t)raw+63)&~(size_t)63;
    team5->hub2 = (void*)aligned;
    return ResNeXt50ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads and stores it in *team4.
// Returns 0 on success. On failure returns a heap-allocated error message,
// releases whatever had been set up, and leaves *team4 untouched.
// nt3 must be at least 1.
static char* ResNeXt50ThreaderCreate1(ResNeXt50ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind counters start out as "nothing to undo".
    ResNeXt50ThreaderTeam1* fresh = calloc(1, sizeof(ResNeXt50ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = ResNeXt50ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure == 0, 1)) {
        *team4 = fresh;
        return 0;
    }
    ResNeXt50ThreaderDestroy1(fresh);
    return failure;
}

// Copies the pthread_t of the team's idx1-th worker into *thr2.
// Returns 0 on success, or a heap-allocated error message when idx1 is
// outside [0, team9->nt1); *thr2 is not written in that case.
static char* ResNeXt50ThreaderPthreadT1(
pthread_t* thr2,
ResNeXt50ThreaderTeam1* team9,
ptrdiff_t idx1
) {
    int inRange = 0 <= idx1 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "bad thread idx");
    }
    ResNeXt50ThreaderNode1* chosen = team9->nodes2+idx1;
    *thr2 = chosen->thr1;
    return 0;
}

// Runs task3 on the team and returns once every worker has reported
// completion. The index space (the product of the first nd1 extents in hull1)
// is split into contiguous shares, one per node; when the total does not
// divide evenly the first `more1` nodes get one extra point.
// Does nothing when the task has fewer than one dimension.
// Every pthread call is retried until it returns 0.
static void ResNeXt50ThreaderDo1(ResNeXt50ThreaderTeam1* team10, ResNeXt50ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// plus2 is the base share size expressed as a coordinate offset.
int64_t plus2[4];
ResNeXt50ThreaderPut1(nd6, task3->hull1, plus2, each1);
// pt6 is the starting point of the share currently being handed out.
int64_t pt6[4] = {0};
ResNeXt50ThreaderHub1* hub6 = team10->hub2;
// The hub mutex stays locked until the wait below, so a worker that finishes
// its share early blocks on it until the hub fields have been set up.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
ResNeXt50ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 for the nodes that receive an extra point, otherwise 0.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start point by the share just assigned.
ResNeXt50ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the shared scan position and mark every node as possibly having work.
// All bits of every status word are set, including bit number nt8, which the
// workers treat as the end-of-bitmap marker.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Sleep until the last worker has decremented pending1 to zero.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Vectorized single-precision exponential approximation for 16 lanes.
// The input is clamped, split as x = r*ln(2) + f with r an integer, a
// degree-4 polynomial in f is evaluated, and the result is scaled by 2^r by
// adding to the exponent bits of the float.
static __m512 ResNeXt50Exp1(__m512 x1) {
// Clamp the argument to [-87.33654, 88.72284].
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
// t = x * 1.442695 (approximately log2(e)); r = t rounded to nearest integer.
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*ln(2), with ln(2) subtracted as two constants (a larger and a
// much smaller part) in two fused multiply-adds.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Horner evaluation of the polynomial in f.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t to an integer, shift it into the exponent field (bit 23 upward)
// and add it to the bit pattern of the polynomial value.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Softmax over 1000 floats (62 full 16-lane vectors plus an 8-lane tail).
// Reads from tensors173[0] and writes to tensors173[1]; team99 is unused.
// Three passes: (1) find the maximum, (2) write exp(x - max) to the output
// while accumulating the sum, (3) multiply the output by 1/sum.
static void ResNeXt50Softmax1(ResNeXt50ThreaderTeam1* team99, char** tensors173) {
(void)team99;
char*restrict ptr5 = tensors173[0];
char*restrict ptr6 = tensors173[1];
// Pass 1: sixteen running maxima, seeded with vectors 0..15.
__m512 max1 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0);
__m512 max2 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1);
__m512 max3 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2);
__m512 max4 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3);
__m512 max5 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4);
__m512 max6 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5);
__m512 max7 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6);
__m512 max8 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7);
__m512 max9 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8);
__m512 max10 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9);
__m512 max11 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10);
__m512 max12 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11);
__m512 max13 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12);
__m512 max14 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13);
__m512 max15 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14);
__m512 max16 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15);
// Fold in vectors 16..47, sixteen at a time.
for (ptrdiff_t i105 = 1; i105 <= 2; ++i105) {
__m512 dat2714 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i105);
__m512 dat2715 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i105);
__m512 dat2716 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i105);
__m512 dat2717 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i105);
__m512 dat2718 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i105);
__m512 dat2719 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i105);
__m512 dat2720 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i105);
__m512 dat2721 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i105);
__m512 dat2722 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i105);
__m512 dat2723 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i105);
__m512 dat2724 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i105);
__m512 dat2725 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i105);
__m512 dat2726 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i105);
__m512 dat2727 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i105);
__m512 dat2728 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i105);
__m512 dat2729 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i105);
max1 = _mm512_max_ps(max1, dat2714);
max2 = _mm512_max_ps(max2, dat2715);
max3 = _mm512_max_ps(max3, dat2716);
max4 = _mm512_max_ps(max4, dat2717);
max5 = _mm512_max_ps(max5, dat2718);
max6 = _mm512_max_ps(max6, dat2719);
max7 = _mm512_max_ps(max7, dat2720);
max8 = _mm512_max_ps(max8, dat2721);
max9 = _mm512_max_ps(max9, dat2722);
max10 = _mm512_max_ps(max10, dat2723);
max11 = _mm512_max_ps(max11, dat2724);
max12 = _mm512_max_ps(max12, dat2725);
max13 = _mm512_max_ps(max13, dat2726);
max14 = _mm512_max_ps(max14, dat2727);
max15 = _mm512_max_ps(max15, dat2728);
max16 = _mm512_max_ps(max16, dat2729);
}
// Fold in the remaining full vectors 48..61.
__m512 dat2730 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
__m512 dat2731 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2732 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2733 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2734 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2735 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2736 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2737 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2738 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2739 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2740 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2741 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2742 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2743 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
max1 = _mm512_max_ps(max1, dat2730);
max2 = _mm512_max_ps(max2, dat2731);
max3 = _mm512_max_ps(max3, dat2732);
max4 = _mm512_max_ps(max4, dat2733);
max5 = _mm512_max_ps(max5, dat2734);
max6 = _mm512_max_ps(max6, dat2735);
max7 = _mm512_max_ps(max7, dat2736);
max8 = _mm512_max_ps(max8, dat2737);
max9 = _mm512_max_ps(max9, dat2738);
max10 = _mm512_max_ps(max10, dat2739);
max11 = _mm512_max_ps(max11, dat2740);
max12 = _mm512_max_ps(max12, dat2741);
max13 = _mm512_max_ps(max13, dat2742);
max14 = _mm512_max_ps(max14, dat2743);
// Tail: vector 62 holds only 8 valid floats, so only the low 8 lanes are
// loaded and merged.
__m512 dat2744 = _mm512_maskz_loadu_ps(255, ptr5+(ptrdiff_t)64*62);
max16 = _mm512_mask_max_ps(max16, 255, max16, dat2744);
// Tree-reduce the sixteen accumulators into max1.
max1 = _mm512_max_ps(max1, max9);
max2 = _mm512_max_ps(max2, max10);
max3 = _mm512_max_ps(max3, max11);
max4 = _mm512_max_ps(max4, max12);
max5 = _mm512_max_ps(max5, max13);
max6 = _mm512_max_ps(max6, max14);
max7 = _mm512_max_ps(max7, max15);
max8 = _mm512_max_ps(max8, max16);
max1 = _mm512_max_ps(max1, max5);
max2 = _mm512_max_ps(max2, max6);
max3 = _mm512_max_ps(max3, max7);
max4 = _mm512_max_ps(max4, max8);
max1 = _mm512_max_ps(max1, max3);
max2 = _mm512_max_ps(max2, max4);
max1 = _mm512_max_ps(max1, max2);
// Horizontal reduction within max1: fold the upper half onto the lower half
// (8, 4, 2, then 1 lane), then broadcast lane 0 to all lanes.
__m512i p5 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
max1 = _mm512_mask_max_ps(max1, 255, max1, _mm512_permutexvar_ps(p5, max1));
__m512i p6 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
max1 = _mm512_mask_max_ps(max1, 15, max1, _mm512_permutexvar_ps(p6, max1));
__m512i p7 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
max1 = _mm512_mask_max_ps(max1, 3, max1, _mm512_permutexvar_ps(p7, max1));
__m512i p8 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
max1 = _mm512_mask_max_ps(max1, 1, max1, _mm512_permutexvar_ps(p8, max1));
__m512i p9 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
max1 = _mm512_permutexvar_ps(p9, max1);
// Pass 2: neg1 = -max in every lane; compute exp(x - max), store it to the
// output and accumulate the sum. Starts with the last group (vectors 48..62).
__m512 sum917 = _mm512_setzero_ps();
__m512 neg1 = _mm512_sub_ps(sum917, max1);
__m512 dat2775 = _mm512_maskz_loadu_ps(255, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3);
__m512 dat2774 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
__m512 dat2773 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2772 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2771 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2770 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2769 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2768 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2767 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2766 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2765 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2764 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2763 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2762 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2761 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
dat2775 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2775));
// Only the 8 valid lanes of the tail vector contribute to the sum.
sum917 = _mm512_mask_add_ps(sum917, 255, sum917, dat2775);
dat2774 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2774));
sum917 = _mm512_add_ps(sum917, dat2774);
dat2773 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2773));
sum917 = _mm512_add_ps(sum917, dat2773);
dat2772 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2772));
sum917 = _mm512_add_ps(sum917, dat2772);
dat2771 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2771));
sum917 = _mm512_add_ps(sum917, dat2771);
dat2770 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2770));
sum917 = _mm512_add_ps(sum917, dat2770);
dat2769 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2769));
sum917 = _mm512_add_ps(sum917, dat2769);
dat2768 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2768));
sum917 = _mm512_add_ps(sum917, dat2768);
dat2767 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2767));
sum917 = _mm512_add_ps(sum917, dat2767);
dat2766 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2766));
sum917 = _mm512_add_ps(sum917, dat2766);
dat2765 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2765));
sum917 = _mm512_add_ps(sum917, dat2765);
dat2764 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2764));
sum917 = _mm512_add_ps(sum917, dat2764);
dat2763 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2763));
sum917 = _mm512_add_ps(sum917, dat2763);
dat2762 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2762));
sum917 = _mm512_add_ps(sum917, dat2762);
dat2761 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2761));
sum917 = _mm512_add_ps(sum917, dat2761);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3, 255, dat2775);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3, 65535, dat2774);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3, 65535, dat2773);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3, 65535, dat2772);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3, 65535, dat2771);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3, 65535, dat2770);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3, 65535, dat2769);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3, 65535, dat2768);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3, 65535, dat2767);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3, 65535, dat2766);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3, 65535, dat2765);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3, 65535, dat2764);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3, 65535, dat2763);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3, 65535, dat2762);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3, 65535, dat2761);
// Same for vectors 47..0, in groups of sixteen, walking backwards.
for (ptrdiff_t i106 = 2; i106 >= 0; --i106) {
__m512 dat2760 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i106);
__m512 dat2759 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i106);
__m512 dat2758 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i106);
__m512 dat2757 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i106);
__m512 dat2756 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i106);
__m512 dat2755 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i106);
__m512 dat2754 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i106);
__m512 dat2753 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i106);
__m512 dat2752 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i106);
__m512 dat2751 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i106);
__m512 dat2750 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i106);
__m512 dat2749 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i106);
__m512 dat2748 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i106);
__m512 dat2747 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i106);
__m512 dat2746 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i106);
__m512 dat2745 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i106);
dat2760 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2760));
sum917 = _mm512_add_ps(sum917, dat2760);
dat2759 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2759));
sum917 = _mm512_add_ps(sum917, dat2759);
dat2758 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2758));
sum917 = _mm512_add_ps(sum917, dat2758);
dat2757 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2757));
sum917 = _mm512_add_ps(sum917, dat2757);
dat2756 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2756));
sum917 = _mm512_add_ps(sum917, dat2756);
dat2755 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2755));
sum917 = _mm512_add_ps(sum917, dat2755);
dat2754 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2754));
sum917 = _mm512_add_ps(sum917, dat2754);
dat2753 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2753));
sum917 = _mm512_add_ps(sum917, dat2753);
dat2752 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2752));
sum917 = _mm512_add_ps(sum917, dat2752);
dat2751 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2751));
sum917 = _mm512_add_ps(sum917, dat2751);
dat2750 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2750));
sum917 = _mm512_add_ps(sum917, dat2750);
dat2749 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2749));
sum917 = _mm512_add_ps(sum917, dat2749);
dat2748 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2748));
sum917 = _mm512_add_ps(sum917, dat2748);
dat2747 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2747));
sum917 = _mm512_add_ps(sum917, dat2747);
dat2746 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2746));
sum917 = _mm512_add_ps(sum917, dat2746);
dat2745 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2745));
sum917 = _mm512_add_ps(sum917, dat2745);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i106, 65535, dat2760);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i106, 65535, dat2759);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i106, 65535, dat2758);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i106, 65535, dat2757);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i106, 65535, dat2756);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i106, 65535, dat2755);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i106, 65535, dat2754);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i106, 65535, dat2753);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i106, 65535, dat2752);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i106, 65535, dat2751);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i106, 65535, dat2750);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i106, 65535, dat2749);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i106, 65535, dat2748);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i106, 65535, dat2747);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i106, 65535, dat2746);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i106, 65535, dat2745);
}
// Horizontal sum of the 16 lanes (same folding pattern as the maximum),
// then broadcast lane 0 to all lanes.
__m512i p10 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
sum917 = _mm512_mask_add_ps(sum917, 255, sum917, _mm512_permutexvar_ps(p10, sum917));
__m512i p11 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
sum917 = _mm512_mask_add_ps(sum917, 15, sum917, _mm512_permutexvar_ps(p11, sum917));
__m512i p12 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
sum917 = _mm512_mask_add_ps(sum917, 3, sum917, _mm512_permutexvar_ps(p12, sum917));
__m512i p13 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
sum917 = _mm512_mask_add_ps(sum917, 1, sum917, _mm512_permutexvar_ps(p13, sum917));
__m512i p14 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
sum917 = _mm512_permutexvar_ps(p14, sum917);
// Pass 3: scale every stored exponential by 1/sum, in place in the output.
__m512 rcp44 = _mm512_div_ps(_mm512_set1_ps(1e+00f), sum917);
for (ptrdiff_t i107 = 0; i107 < 62; ++i107) {
__m512 dat2776 = _mm512_maskz_loadu_ps(65535, ptr6+(ptrdiff_t)64*i107);
dat2776 = _mm512_mul_ps(rcp44, dat2776);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*i107, 65535, dat2776);
}
// 8-lane tail.
__m512 dat2777 = _mm512_maskz_loadu_ps(255, ptr6+(ptrdiff_t)64*62);
dat2777 = _mm512_mul_ps(rcp44, dat2777);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*62, 255, dat2777);
}

static __m512 ResNeXt50Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

// Precomputes a (multiplier, addend) pair for each of 3 elements and writes
// the pairs interleaved to mas1.
//
// For every element c in [0, 3):
//   multiplier = scales1[c] * rsqrt(2e-05 + variances1[c])
//   addend     = shifts1[c] - means1[c] * multiplier
// Output (6 floats = 24 bytes): multiplier[0], addend[0], multiplier[1], ...
// Only the first 3 floats of each input and the first 6 floats of the output
// are touched (masked load/store).
static void ResNeXt50BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
    const __mmask16 inLanes = 7;    // low 3 lanes
    const __mmask16 outLanes = 63;  // low 6 lanes
    // Picks lanes alternately from the first (0..7) and second (16..23) operand.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512 epsilon = _mm512_set1_ps(2e-05f);

    __m512 variance = _mm512_maskz_loadu_ps(inLanes, variances1);
    __m512 scale = _mm512_maskz_loadu_ps(inLanes, scales1);
    __m512 mean = _mm512_maskz_loadu_ps(inLanes, means1);
    __m512 shift = _mm512_maskz_loadu_ps(inLanes, shifts1);

    __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, variance));
    __m512 multiplier = _mm512_mul_ps(invStd, scale);
    // shift - mean*multiplier
    __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);

    __m512 pairs = _mm512_permutex2var_ps(multiplier, weaveLo, addend);
    _mm512_mask_storeu_ps(mas1, outLanes, pairs);
}

// Precomputes a (multiplier, addend) pair for each of 64 elements and writes
// the pairs interleaved to mas2.
//
// For every element c in [0, 64):
//   multiplier = scales2[c] * rsqrt(2e-05 + variances2[c])
//   addend     = shifts2[c] - means2[c] * multiplier
// Output (512 bytes): multiplier[0], addend[0], multiplier[1], addend[1], ...
static void ResNeXt50BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas2
) {
    const __m512 epsilon = _mm512_set1_ps(2e-05f);
    // Interleave lanes 0..7 (weaveLo) / 8..15 (weaveHi) of two operands.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i weaveHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // One 16-float vector per step; each step emits two 64-byte output vectors.
    for (ptrdiff_t vec = 0; vec < 4; ++vec) {
        const ptrdiff_t src = (ptrdiff_t)16*vec;
        const ptrdiff_t dst = (ptrdiff_t)128*vec;
        __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances2+src)));
        __m512 multiplier = _mm512_mul_ps(invStd, _mm512_loadu_ps(scales2+src));
        // shift - mean*multiplier
        __m512 addend = _mm512_fnmadd_ps(_mm512_loadu_ps(means2+src), multiplier, _mm512_loadu_ps(shifts2+src));
        _mm512_storeu_ps(mas2+dst, _mm512_permutex2var_ps(multiplier, weaveLo, addend));
        _mm512_storeu_ps(mas2+dst+(ptrdiff_t)64, _mm512_permutex2var_ps(multiplier, weaveHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 256 elements and writes
// the pairs interleaved to mas4.
//
// For every element c in [0, 256):
//   multiplier = scales3[c] * rsqrt(2e-05 + variances3[c])
//   addend     = shifts3[c] - means3[c] * multiplier
// Output (2048 bytes): multiplier[0], addend[0], multiplier[1], addend[1], ...
static void ResNeXt50BnSimplify3(
float*restrict means3,
float*restrict variances3,
float*restrict scales3,
float*restrict shifts3,
char*restrict mas4
) {
    const __m512 epsilon = _mm512_set1_ps(2e-05f);
    // Interleave lanes 0..7 (weaveLo) / 8..15 (weaveHi) of two operands.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i weaveHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 16 vectors of 16 floats; each step emits two 64-byte output vectors.
    for (ptrdiff_t vec = 0; vec < 16; ++vec) {
        const ptrdiff_t src = (ptrdiff_t)16*vec;
        const ptrdiff_t dst = (ptrdiff_t)128*vec;
        __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances3+src)));
        __m512 multiplier = _mm512_mul_ps(invStd, _mm512_loadu_ps(scales3+src));
        // shift - mean*multiplier
        __m512 addend = _mm512_fnmadd_ps(_mm512_loadu_ps(means3+src), multiplier, _mm512_loadu_ps(shifts3+src));
        _mm512_storeu_ps(mas4+dst, _mm512_permutex2var_ps(multiplier, weaveLo, addend));
        _mm512_storeu_ps(mas4+dst+(ptrdiff_t)64, _mm512_permutex2var_ps(multiplier, weaveHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 128 elements and writes
// the pairs interleaved to mas5.
//
// For every element c in [0, 128):
//   multiplier = scales4[c] * rsqrt(2e-05 + variances4[c])
//   addend     = shifts4[c] - means4[c] * multiplier
// Output (1024 bytes): multiplier[0], addend[0], multiplier[1], addend[1], ...
static void ResNeXt50BnSimplify4(
float*restrict means4,
float*restrict variances4,
float*restrict scales4,
float*restrict shifts4,
char*restrict mas5
) {
    const __m512 epsilon = _mm512_set1_ps(2e-05f);
    // Interleave lanes 0..7 (weaveLo) / 8..15 (weaveHi) of two operands.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i weaveHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 8 vectors of 16 floats; each step emits two 64-byte output vectors.
    for (ptrdiff_t vec = 0; vec < 8; ++vec) {
        const ptrdiff_t src = (ptrdiff_t)16*vec;
        const ptrdiff_t dst = (ptrdiff_t)128*vec;
        __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances4+src)));
        __m512 multiplier = _mm512_mul_ps(invStd, _mm512_loadu_ps(scales4+src));
        // shift - mean*multiplier
        __m512 addend = _mm512_fnmadd_ps(_mm512_loadu_ps(means4+src), multiplier, _mm512_loadu_ps(shifts4+src));
        _mm512_storeu_ps(mas5+dst, _mm512_permutex2var_ps(multiplier, weaveLo, addend));
        _mm512_storeu_ps(mas5+dst+(ptrdiff_t)64, _mm512_permutex2var_ps(multiplier, weaveHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 512 elements and writes
// the pairs interleaved to mas8.
//
// For every element c in [0, 512):
//   multiplier = scales5[c] * rsqrt(2e-05 + variances5[c])
//   addend     = shifts5[c] - means5[c] * multiplier
// Output (4096 bytes): multiplier[0], addend[0], multiplier[1], addend[1], ...
static void ResNeXt50BnSimplify5(
float*restrict means5,
float*restrict variances5,
float*restrict scales5,
float*restrict shifts5,
char*restrict mas8
) {
    const __m512 epsilon = _mm512_set1_ps(2e-05f);
    // Interleave lanes 0..7 (weaveLo) / 8..15 (weaveHi) of two operands.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i weaveHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 32 vectors of 16 floats; each step emits two 64-byte output vectors.
    for (ptrdiff_t vec = 0; vec < 32; ++vec) {
        const ptrdiff_t src = (ptrdiff_t)16*vec;
        const ptrdiff_t dst = (ptrdiff_t)128*vec;
        __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances5+src)));
        __m512 multiplier = _mm512_mul_ps(invStd, _mm512_loadu_ps(scales5+src));
        // shift - mean*multiplier
        __m512 addend = _mm512_fnmadd_ps(_mm512_loadu_ps(means5+src), multiplier, _mm512_loadu_ps(shifts5+src));
        _mm512_storeu_ps(mas8+dst, _mm512_permutex2var_ps(multiplier, weaveLo, addend));
        _mm512_storeu_ps(mas8+dst+(ptrdiff_t)64, _mm512_permutex2var_ps(multiplier, weaveHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 1024 elements and
// writes the pairs interleaved to mas11.
//
// For every element c in [0, 1024):
//   multiplier = scales6[c] * rsqrt(2e-05 + variances6[c])
//   addend     = shifts6[c] - means6[c] * multiplier
// Output (8192 bytes): multiplier[0], addend[0], multiplier[1], addend[1], ...
static void ResNeXt50BnSimplify6(
float*restrict means6,
float*restrict variances6,
float*restrict scales6,
float*restrict shifts6,
char*restrict mas11
) {
    const __m512 epsilon = _mm512_set1_ps(2e-05f);
    // Interleave lanes 0..7 (weaveLo) / 8..15 (weaveHi) of two operands.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i weaveHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 64 vectors of 16 floats; each step emits two 64-byte output vectors.
    for (ptrdiff_t vec = 0; vec < 64; ++vec) {
        const ptrdiff_t src = (ptrdiff_t)16*vec;
        const ptrdiff_t dst = (ptrdiff_t)128*vec;
        __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances6+src)));
        __m512 multiplier = _mm512_mul_ps(invStd, _mm512_loadu_ps(scales6+src));
        // shift - mean*multiplier
        __m512 addend = _mm512_fnmadd_ps(_mm512_loadu_ps(means6+src), multiplier, _mm512_loadu_ps(shifts6+src));
        _mm512_storeu_ps(mas11+dst, _mm512_permutex2var_ps(multiplier, weaveLo, addend));
        _mm512_storeu_ps(mas11+dst+(ptrdiff_t)64, _mm512_permutex2var_ps(multiplier, weaveHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 2048 elements and
// writes the pairs interleaved to mas14.
//
// For every element c in [0, 2048):
//   multiplier = scales7[c] * rsqrt(2e-05 + variances7[c])
//   addend     = shifts7[c] - means7[c] * multiplier
// Output (16384 bytes): multiplier[0], addend[0], multiplier[1], addend[1], ...
static void ResNeXt50BnSimplify7(
float*restrict means7,
float*restrict variances7,
float*restrict scales7,
float*restrict shifts7,
char*restrict mas14
) {
    const __m512 epsilon = _mm512_set1_ps(2e-05f);
    // Interleave lanes 0..7 (weaveLo) / 8..15 (weaveHi) of two operands.
    const __m512i weaveLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i weaveHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 128 vectors of 16 floats; each step emits two 64-byte output vectors.
    for (ptrdiff_t vec = 0; vec < 128; ++vec) {
        const ptrdiff_t src = (ptrdiff_t)16*vec;
        const ptrdiff_t dst = (ptrdiff_t)128*vec;
        __m512 invStd = ResNeXt50Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances7+src)));
        __m512 multiplier = _mm512_mul_ps(invStd, _mm512_loadu_ps(scales7+src));
        // shift - mean*multiplier
        __m512 addend = _mm512_fnmadd_ps(_mm512_loadu_ps(means7+src), multiplier, _mm512_loadu_ps(shifts7+src));
        _mm512_storeu_ps(mas14+dst, _mm512_permutex2var_ps(multiplier, weaveLo, addend));
        _mm512_storeu_ps(mas14+dst+(ptrdiff_t)64, _mm512_permutex2var_ps(multiplier, weaveHi, addend));
    }
}

// Worker for one task of the Glopl1 stage: sums groups of 49 floats and writes
// each total scaled by 2.0408163e-02f (approximately 1/49).
//
// task172->any1 is read as an array of byte pointers: [0] is the input,
// [1] is the output. pt91[0] is the task index c80; each task covers
// 40960 input bytes and 512 output bytes (128 floats).
//
// NOTE(review): each group spans 196 bytes (49 floats) while consecutive
// groups are 320 bytes apart -- confirm this spacing against the producer
// of tensors168[0].
static void ResNeXt50Glopl1Callee1(ResNeXt50ThreaderTask1* task172, int64_t* pt91) {
char** tensors168 = task172->any1;
ptrdiff_t c80 = pt91[0];
char*restrict ptr3 = tensors168[0]+(ptrdiff_t)40960*c80;
char*restrict ptr4 = tensors168[1]+(ptrdiff_t)512*c80;
// buf1 gathers 16 results (8 iterations x 2 groups) before each store.
// mask3 marks the lanes of buf1 that the current iteration may still overwrite.
__m512 buf1 = _mm512_setzero_ps();
__mmask16 mask3 = 65535;
for (ptrdiff_t i100 = 0; i100 < 64; ++i100) {
// First group: 16+16+16+1 floats starting at byte offset 0 of this 640-byte step.
__m512 acc1 = _mm512_maskz_loadu_ps(65535, ptr3+(ptrdiff_t)0+(ptrdiff_t)640*i100);
__m512 acc2 = _mm512_maskz_loadu_ps(65535, ptr3+(ptrdiff_t)64+(ptrdiff_t)640*i100);
__m512 acc3 = _mm512_maskz_loadu_ps(65535, ptr3+(ptrdiff_t)128+(ptrdiff_t)640*i100);
__m512 acc4 = _mm512_maskz_loadu_ps(1, ptr3+(ptrdiff_t)192+(ptrdiff_t)640*i100);
// Second group: same shape, starting at byte offset 320.
__m512 acc5 = _mm512_maskz_loadu_ps(65535, ptr3+(ptrdiff_t)320+(ptrdiff_t)640*i100);
__m512 acc6 = _mm512_maskz_loadu_ps(65535, ptr3+(ptrdiff_t)384+(ptrdiff_t)640*i100);
__m512 acc7 = _mm512_maskz_loadu_ps(65535, ptr3+(ptrdiff_t)448+(ptrdiff_t)640*i100);
__m512 acc8 = _mm512_maskz_loadu_ps(1, ptr3+(ptrdiff_t)512+(ptrdiff_t)640*i100);
// Fold the four vectors of each group into one (the single-float tail only
// contributes to lane 0).
acc1 = _mm512_mask_add_ps(acc1, 65535, acc1, acc3);
acc5 = _mm512_mask_add_ps(acc5, 65535, acc5, acc7);
acc2 = _mm512_mask_add_ps(acc2, 1, acc2, acc4);
acc6 = _mm512_mask_add_ps(acc6, 1, acc6, acc8);
acc1 = _mm512_mask_add_ps(acc1, 65535, acc1, acc2);
acc5 = _mm512_mask_add_ps(acc5, 65535, acc5, acc6);
// Index vectors for the final step: even lanes pick from acc1, odd lanes from acc5.
__m512i pm1lo1 = _mm512_set_epi32(16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0);
__m512i pm1hi1 = _mm512_set_epi32(17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1);
// Horizontal reduction, halving the live width each step: 16 -> 8 lanes.
__m512 hi43 = _mm512_shuffle_f32x4(acc1, acc1, 238);
__m512 hi46 = _mm512_shuffle_f32x4(acc5, acc5, 238);
acc1 = _mm512_mask_add_ps(acc1, 255, acc1, hi43);
acc5 = _mm512_mask_add_ps(acc5, 255, acc5, hi46);
// 8 -> 4 lanes.
__m512 hi44 = _mm512_shuffle_f32x4(acc1, acc1, 1);
__m512 hi47 = _mm512_shuffle_f32x4(acc5, acc5, 1);
acc1 = _mm512_mask_add_ps(acc1, 15, acc1, hi44);
acc5 = _mm512_mask_add_ps(acc5, 15, acc5, hi47);
// 4 -> 2 lanes.
__m512 hi45 = _mm512_shuffle_ps(acc1, acc1, 238);
__m512 hi48 = _mm512_shuffle_ps(acc5, acc5, 238);
acc1 = _mm512_mask_add_ps(acc1, 3, acc1, hi45);
acc5 = _mm512_mask_add_ps(acc5, 3, acc5, hi48);
// 2 -> 1: afterwards every even lane holds the first group's total and every
// odd lane holds the second group's total.
__m512 hi49 = _mm512_permutex2var_ps(acc1, pm1hi1, acc5);
acc1 = _mm512_permutex2var_ps(acc1, pm1lo1, acc5);
acc1 = _mm512_mask_add_ps(acc1, 65535, acc1, hi49);
// Write the pair into every lane still enabled in mask3, then retire the two
// lowest enabled lanes so later iterations leave this pair in place.
buf1 = _mm512_mask_mov_ps(buf1, mask3, acc1);
mask3 &= mask3<<2;
// After 8 iterations all lanes are retired: scale and flush 16 results.
if (__builtin_expect(!mask3, 0)) {
mask3 = 65535;
buf1 = _mm512_mul_ps(buf1, _mm512_set1_ps(2.0408163e-02f));
// 2*i100-14 is the float index of the first result gathered in this batch.
_mm512_mask_storeu_ps(ptr4+(ptrdiff_t)4*((ptrdiff_t)2*i100-14), 65535, buf1);
}
}
}

// Hands ResNeXt50Glopl1Callee1 to the threader team as a one-dimensional
// task with extent 16, passing the tensor pointer table through unchanged.
static void ResNeXt50Glopl1(ResNeXt50ThreaderTeam1* team96, char** tensors167) {
    ResNeXt50ThreaderTask1 job;
    // Iteration space: a single dimension of extent 16.
    job.nd1 = 1;
    job.hull1[0] = 16;
    // Kernel and its argument.
    job.any1 = tensors167;
    job.callee1 = ResNeXt50Glopl1Callee1;
    ResNeXt50ThreaderDo1(team96, &job);
}

// Worker for one task of the Thrpl1 stage: 3x3 window, stride 2 max pooling
// of one 112-float-wide, 112-row plane into a 56x56 plane.
//
// task12->any1 is read as an array of byte pointers: [0] is the input,
// [1] is the output. pt11[0..2] are the task coordinates (b43, e5, c4).
//
// Output element (r, o) is the maximum over input rows 2r-1..2r+1 and input
// columns 2o-1..2o+1; positions outside the plane (row -1, column -1) are
// simply left out of the maximum.
//
// NOTE(review): one plane as addressed here spans 112 x 448 = 50176 input
// bytes and 56 x 224 = 12544 output bytes, while the c4/i10 steps are 50240
// and 12608 -- confirm against the tensor layout.
static void ResNeXt50Thrpl1Callee1(ResNeXt50ThreaderTask1* task12, int64_t* pt11) {
char** tensors10 = task12->any1;
ptrdiff_t b43 = pt11[0];
ptrdiff_t e5 = pt11[1];
ptrdiff_t c4 = pt11[2];
// ptr1 is biased back by one input row (448 bytes) so that, for output row j,
// offsets 0 / 448 / 896 (+896*j) address input rows 2j-1 / 2j / 2j+1.
char*restrict ptr1 = tensors10[0]-(ptrdiff_t)448+(ptrdiff_t)50176*b43+(ptrdiff_t)448*e5+(ptrdiff_t)50240*c4;
char*restrict ptr2 = tensors10[1]+(ptrdiff_t)12544*b43+(ptrdiff_t)224*e5+(ptrdiff_t)12608*c4;
for (ptrdiff_t i10 = 0; i10 < 1; ++i10) {
// ---- Output row 0: only input rows 0 and 1 exist (offsets 448 and 896). ----
// First 32 input columns -> 16 outputs.
__m512 in1 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 in2 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat894 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat895 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
// Vertical maximum.
in1 = _mm512_max_ps(in1, dat894);
in2 = _mm512_max_ps(in2, dat895);
// pm57: even columns (window centre); pm58: odd columns (right neighbour);
// pm59: odd columns shifted by one output (left neighbour), with lane 0
// taking element 31 of the second operand.
__m512i pm57 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm58 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm59 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out1 = _mm512_permutex2var_ps(in1, pm57, in2);
__m512 pack263 = _mm512_permutex2var_ps(in1, pm58, in2);
__m512 pack264 = _mm512_permutex2var_ps(in1, pm59, in2);
out1 = _mm512_mask_max_ps(out1, 65535, out1, pack263);
// Mask 65534 skips lane 0: the first output of a row has no left neighbour.
out1 = _mm512_mask_max_ps(out1, 65534, out1, pack264);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*0, 65535, out1);
// Middle 64 input columns, 32 at a time.
for (ptrdiff_t k44 = 1; k44 < 3; ++k44) {
__m512 in3 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 in4 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat896 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat897 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
in3 = _mm512_max_ps(in3, dat896);
in4 = _mm512_max_ps(in4, dat897);
// Put the last column of the previous 32-column block into lane 15 so that
// pm59's index 31 picks it up as the left neighbour of this block's first output.
__m512 blend1 = _mm512_mask_mov_ps(in4, 32768, in2);
__m512 out2 = _mm512_permutex2var_ps(in3, pm57, in4);
__m512 pack265 = _mm512_permutex2var_ps(in3, pm58, in4);
__m512 pack266 = _mm512_permutex2var_ps(in3, pm59, blend1);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack265);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack266);
// Carry this block's upper half forward for the next block's left neighbour.
in2 = in4;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*k44, 65535, out2);
}
// Last 16 input columns -> 8 outputs. Single-source permutes use only the low
// 4 index bits, so index 31 reads lane 15 of blend2 (the carried-over column).
__m512 in5 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
__m512 dat898 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
in5 = _mm512_max_ps(in5, dat898);
__m512 blend2 = _mm512_mask_mov_ps(in5, 32768, in2);
__m512 out3 = _mm512_permutexvar_ps(pm57, in5);
__m512 pack267 = _mm512_permutexvar_ps(pm58, in5);
__m512 pack268 = _mm512_permutexvar_ps(pm59, blend2);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack267);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack268);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*3, 255, out3);
// ---- Output rows 1..55: three input rows each (offsets 0, 448, 896). ----
for (ptrdiff_t j6 = 1; j6 < 56; ++j6) {
__m512 in6 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 in7 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat899 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat901 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat900 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat902 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
// Vertical maximum over the three rows.
in6 = _mm512_max_ps(in6, dat899);
in7 = _mm512_max_ps(in7, dat901);
in6 = _mm512_max_ps(in6, dat900);
in7 = _mm512_max_ps(in7, dat902);
// Same centre / right / left index patterns as pm57..pm59 above.
__m512i pm60 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm61 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm62 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out4 = _mm512_permutex2var_ps(in6, pm60, in7);
__m512 pack269 = _mm512_permutex2var_ps(in6, pm61, in7);
__m512 pack270 = _mm512_permutex2var_ps(in6, pm62, in7);
out4 = _mm512_mask_max_ps(out4, 65535, out4, pack269);
// Lane 0 has no left neighbour at the start of a row.
out4 = _mm512_mask_max_ps(out4, 65534, out4, pack270);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*0, 65535, out4);
for (ptrdiff_t k45 = 1; k45 < 3; ++k45) {
__m512 in8 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 in9 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat903 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat905 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat904 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat906 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
in8 = _mm512_max_ps(in8, dat903);
in9 = _mm512_max_ps(in9, dat905);
in8 = _mm512_max_ps(in8, dat904);
in9 = _mm512_max_ps(in9, dat906);
// Lane 15 <- last column of the previous block (left neighbour for output 0).
__m512 blend3 = _mm512_mask_mov_ps(in9, 32768, in7);
__m512 out5 = _mm512_permutex2var_ps(in8, pm60, in9);
__m512 pack271 = _mm512_permutex2var_ps(in8, pm61, in9);
__m512 pack272 = _mm512_permutex2var_ps(in8, pm62, blend3);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack271);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack272);
in7 = in9;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*k45, 65535, out5);
}
// Row tail: last 16 input columns -> 8 outputs.
__m512 in10 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat907 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat908 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
in10 = _mm512_max_ps(in10, dat907);
in10 = _mm512_max_ps(in10, dat908);
__m512 blend4 = _mm512_mask_mov_ps(in10, 32768, in7);
__m512 out6 = _mm512_permutexvar_ps(pm60, in10);
__m512 pack273 = _mm512_permutexvar_ps(pm61, in10);
__m512 pack274 = _mm512_permutexvar_ps(pm62, blend4);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack273);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack274);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*3, 255, out6);
}
}
}

// Hands ResNeXt50Thrpl1Callee1 to the threader team as a three-dimensional
// task with extents 1 x 1 x 64, passing the tensor pointer table through unchanged.
static void ResNeXt50Thrpl1(ResNeXt50ThreaderTeam1* team18, char** tensors9) {
    ResNeXt50ThreaderTask1 job;
    // Iteration space: three dimensions, extents 1, 1 and 64.
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 1;
    job.hull1[2] = 64;
    // Kernel and its argument.
    job.any1 = tensors9;
    job.callee1 = ResNeXt50Thrpl1Callee1;
    ResNeXt50ThreaderDo1(team18, &job);
}

// Worker for one task of the FcArrange1 stage: converts a slab of float32
// weight rows to float16, interleaves them for ResNeXt50FcApply1Callee1,
// and copies the matching biases.
//
// task174->any1 is read as an array of byte pointers: [0] float32 weights
// (row pitch 8192 bytes), [1] float32 biases, [2] the arranged output, with
// half-precision weights first and float32 biases starting at byte 4096000.
// pt92[0] is the task index t46. Tasks below 62 handle 16 rows each; the
// remaining path handles 8 rows.
static void ResNeXt50FcArrange1Callee1(ResNeXt50ThreaderTask1* task174, int64_t* pt92) {
char** tensors170 = task174->any1;
ptrdiff_t t46 = pt92[0];
char*restrict weights1 = tensors170[0]+(ptrdiff_t)131072*t46;
char*restrict biases1 = tensors170[1]+(ptrdiff_t)64*t46;
char*restrict weights2 = tensors170[2]+(ptrdiff_t)65536*t46;
char*restrict biases2 = tensors170[2]+(ptrdiff_t)4096000+(ptrdiff_t)64*t46;
if (t46 < 62) {
// Full slab: 16 rows.
for (ptrdiff_t i101 = 0; i101 < 1; ++i101) {
// Walk each row in 128 steps of 16 floats (64 bytes).
for (ptrdiff_t j91 = 0; j91 < 128; ++j91) {
// One 16-float slice from each of the 16 rows (wtLoN = row 2N-2, wtHiN = row 2N-1).
__m512 wtLo1 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)0+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi1 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)8192+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo2 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)16384+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi2 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)24576+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo3 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)32768+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi3 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)40960+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo4 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)49152+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi4 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)57344+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo5 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)65536+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi5 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)73728+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo6 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)81920+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi6 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)90112+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo7 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)98304+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi7 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)106496+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtLo8 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)114688+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
__m512 wtHi8 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)122880+(ptrdiff_t)131072*i101+(ptrdiff_t)64*j91);
// Narrow each slice to 16 half-precision values (round to nearest, exceptions suppressed).
__m256i halfLo1 = _mm512_cvtps_ph(wtLo1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi1 = _mm512_cvtps_ph(wtHi1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo2 = _mm512_cvtps_ph(wtLo2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi2 = _mm512_cvtps_ph(wtHi2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo3 = _mm512_cvtps_ph(wtLo3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi3 = _mm512_cvtps_ph(wtHi3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo4 = _mm512_cvtps_ph(wtLo4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi4 = _mm512_cvtps_ph(wtHi4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo5 = _mm512_cvtps_ph(wtLo5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi5 = _mm512_cvtps_ph(wtHi5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo6 = _mm512_cvtps_ph(wtLo6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi6 = _mm512_cvtps_ph(wtHi6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo7 = _mm512_cvtps_ph(wtLo7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi7 = _mm512_cvtps_ph(wtHi7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo8 = _mm512_cvtps_ph(wtLo8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi8 = _mm512_cvtps_ph(wtHi8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// Pair adjacent rows: low 256 bits = even row, high 256 bits = odd row.
__m512i yield1 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo1), halfHi1, 1);
__m512i yield2 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo2), halfHi2, 1);
__m512i yield3 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo3), halfHi3, 1);
__m512i yield4 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo4), halfHi4, 1);
__m512i yield5 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo5), halfHi5, 1);
__m512i yield6 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo6), halfHi6, 1);
__m512i yield7 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo7), halfHi7, 1);
__m512i yield8 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo8), halfHi8, 1);
// The 8 row pairs for this slice are stored contiguously (512 bytes per slice).
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)0+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield1);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)64+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield2);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)128+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield3);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)192+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield4);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)256+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield5);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)320+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield6);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)384+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield7);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)448+(ptrdiff_t)65536*i101+(ptrdiff_t)512*j91, 65535, yield8);
}
// Biases are copied through unchanged as float32 (16 values).
__m512 bias10 = _mm512_maskz_loadu_ps(65535, biases1+(ptrdiff_t)0+(ptrdiff_t)64*i101);
_mm512_mask_storeu_ps(biases2+(ptrdiff_t)0+(ptrdiff_t)64*i101, 65535, bias10);
}
return;
}
// Partial slab: 8 rows. Each step covers two adjacent 16-float slices
// (offsets +0 and +64) so that a step still emits 512 bytes.
for (ptrdiff_t i102 = 0; i102 < 1; ++i102) {
for (ptrdiff_t j92 = 0; j92 < 64; ++j92) {
// First slice of the step, rows 0..7.
__m512 wtLo9 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)0+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi9 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)8192+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtLo10 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)16384+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi10 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)24576+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtLo11 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)32768+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi11 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)40960+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtLo12 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)49152+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi12 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)57344+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
// Second slice of the step (64 bytes further along each row), rows 0..7.
__m512 wtLo13 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)64+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi13 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)8256+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtLo14 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)16448+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi14 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)24640+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtLo15 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)32832+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi15 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)41024+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtLo16 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)49216+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
__m512 wtHi16 = _mm512_maskz_loadu_ps(65535, weights1+(ptrdiff_t)57408+(ptrdiff_t)65536*i102+(ptrdiff_t)128*j92);
// Narrow to half precision.
__m256i halfLo9 = _mm512_cvtps_ph(wtLo9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi9 = _mm512_cvtps_ph(wtHi9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo10 = _mm512_cvtps_ph(wtLo10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi10 = _mm512_cvtps_ph(wtHi10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo11 = _mm512_cvtps_ph(wtLo11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi11 = _mm512_cvtps_ph(wtHi11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo12 = _mm512_cvtps_ph(wtLo12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi12 = _mm512_cvtps_ph(wtHi12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo13 = _mm512_cvtps_ph(wtLo13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi13 = _mm512_cvtps_ph(wtHi13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo14 = _mm512_cvtps_ph(wtLo14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi14 = _mm512_cvtps_ph(wtHi14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo15 = _mm512_cvtps_ph(wtLo15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi15 = _mm512_cvtps_ph(wtHi15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfLo16 = _mm512_cvtps_ph(wtLo16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m256i halfHi16 = _mm512_cvtps_ph(wtHi16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// Pair adjacent rows within each slice.
__m512i yield9 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo9), halfHi9, 1);
__m512i yield10 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo10), halfHi10, 1);
__m512i yield11 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo11), halfHi11, 1);
__m512i yield12 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo12), halfHi12, 1);
__m512i yield13 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo13), halfHi13, 1);
__m512i yield14 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo14), halfHi14, 1);
__m512i yield15 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo15), halfHi15, 1);
__m512i yield16 = _mm512_inserti64x4(_mm512_castsi256_si512(halfLo16), halfHi16, 1);
// First 256 bytes: the first slice's 4 row pairs; next 256 bytes: the second slice's.
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)0+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield9);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)64+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield10);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)128+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield11);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)192+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield12);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)256+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield13);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)320+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield14);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)384+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield15);
_mm512_mask_storeu_epi32(weights2+(ptrdiff_t)448+(ptrdiff_t)65536*i102+(ptrdiff_t)512*j92, 65535, yield16);
}
// Only 8 biases in the partial slab (mask 255).
__m512 bias11 = _mm512_maskz_loadu_ps(255, biases1+(ptrdiff_t)0+(ptrdiff_t)32*i102);
_mm512_mask_storeu_ps(biases2+(ptrdiff_t)0+(ptrdiff_t)32*i102, 255, bias11);
}
}

// Hands ResNeXt50FcArrange1Callee1 to the threader team as a one-dimensional
// task with extent 63, passing the tensor pointer table through unchanged.
static void ResNeXt50FcArrange1(ResNeXt50ThreaderTeam1* team97, char** tensors169) {
    ResNeXt50ThreaderTask1 job;
    // Iteration space: a single dimension of extent 63.
    job.nd1 = 1;
    job.hull1[0] = 63;
    // Kernel and its argument.
    job.any1 = tensors169;
    job.callee1 = ResNeXt50FcArrange1Callee1;
    ResNeXt50ThreaderDo1(team97, &job);
}

// Worker for one task of the FcApply1 stage: computes a slab of
// fully-connected outputs as (half-precision weights, widened to float32)
// dot (2048 input floats) + bias.
//
// task176->any1 is read as an array of byte pointers: [0] the arranged
// weights, with float32 biases at byte 4096000, [1] the input vector,
// [2] the output vector. pt93[0] is the task index t47. Tasks below 62
// produce 16 outputs each; the remaining path produces 8.
static void ResNeXt50FcApply1Callee1(ResNeXt50ThreaderTask1* task176, int64_t* pt93) {
char** tensors172 = task176->any1;
ptrdiff_t t47 = pt93[0];
char*restrict wtPtr27 = tensors172[0]+(ptrdiff_t)65536*t47;
char*restrict biasPtr26 = tensors172[0]+(ptrdiff_t)4096000+(ptrdiff_t)64*t47;
char*restrict datPtr56 = tensors172[1];
char*restrict datPtr57 = tensors172[2]+(ptrdiff_t)64*t47;
if (t47 < 62) {
// Full slab: 16 outputs.
for (ptrdiff_t i103 = 0; i103 < 1; ++i103) {
// One accumulator per output row; sum893+k accumulates row k lane-wise.
__m512 sum893 = _mm512_setzero_ps();
__m512 sum894 = _mm512_setzero_ps();
__m512 sum895 = _mm512_setzero_ps();
__m512 sum896 = _mm512_setzero_ps();
__m512 sum897 = _mm512_setzero_ps();
__m512 sum898 = _mm512_setzero_ps();
__m512 sum899 = _mm512_setzero_ps();
__m512 sum900 = _mm512_setzero_ps();
__m512 sum901 = _mm512_setzero_ps();
__m512 sum902 = _mm512_setzero_ps();
__m512 sum903 = _mm512_setzero_ps();
__m512 sum904 = _mm512_setzero_ps();
__m512 sum905 = _mm512_setzero_ps();
__m512 sum906 = _mm512_setzero_ps();
__m512 sum907 = _mm512_setzero_ps();
__m512 sum908 = _mm512_setzero_ps();
// 128 steps of 16 input floats each.
for (ptrdiff_t j93 = 0; j93 < 128; ++j93) {
// Each 64-byte weight vector holds two rows of 16 halfs (low half / high half).
__m512i wts1 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512 dat2712 = _mm512_maskz_loadu_ps(65535, datPtr56+(ptrdiff_t)0+(ptrdiff_t)64*j93);
__m512i wts2 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts3 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts4 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
// Widen rows 0..7 back to float32.
__m512 wtLo17 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts1));
__m512 wtHi17 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts1, 1));
__m512 wtLo18 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts2));
__m512 wtHi18 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts2, 1));
__m512 wtLo19 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts3));
__m512 wtHi19 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts3, 1));
__m512 wtLo20 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts4));
__m512 wtHi20 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts4, 1));
sum893 = _mm512_fmadd_ps(wtLo17, dat2712, sum893);
sum894 = _mm512_fmadd_ps(wtHi17, dat2712, sum894);
sum895 = _mm512_fmadd_ps(wtLo18, dat2712, sum895);
sum896 = _mm512_fmadd_ps(wtHi18, dat2712, sum896);
sum897 = _mm512_fmadd_ps(wtLo19, dat2712, sum897);
sum898 = _mm512_fmadd_ps(wtHi19, dat2712, sum898);
sum899 = _mm512_fmadd_ps(wtLo20, dat2712, sum899);
sum900 = _mm512_fmadd_ps(wtHi20, dat2712, sum900);
// Rows 8..15, same input slice.
__m512i wts5 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)256+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts6 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)320+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts7 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)384+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts8 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)448+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512 wtLo21 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts5));
__m512 wtHi21 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts5, 1));
__m512 wtLo22 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts6));
__m512 wtHi22 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts6, 1));
__m512 wtLo23 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts7));
__m512 wtHi23 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts7, 1));
__m512 wtLo24 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts8));
__m512 wtHi24 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts8, 1));
sum901 = _mm512_fmadd_ps(wtLo21, dat2712, sum901);
sum902 = _mm512_fmadd_ps(wtHi21, dat2712, sum902);
sum903 = _mm512_fmadd_ps(wtLo22, dat2712, sum903);
sum904 = _mm512_fmadd_ps(wtHi22, dat2712, sum904);
sum905 = _mm512_fmadd_ps(wtLo23, dat2712, sum905);
sum906 = _mm512_fmadd_ps(wtHi23, dat2712, sum906);
sum907 = _mm512_fmadd_ps(wtLo24, dat2712, sum907);
sum908 = _mm512_fmadd_ps(wtHi24, dat2712, sum908);
}
__m512 bias12 = _mm512_maskz_loadu_ps(65535, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)64*i103);
// Reduction network: four pairwise merge-and-add stages fold the 16
// accumulators into one vector whose lane k is the full horizontal sum of
// accumulator k (sum893+k).
__m512i pm1Lo1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
__m512i pm1Hi1 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
__m512i pm4Lo1 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi1 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Stage 1 (even rows): add the upper 256 bits onto the lower 256 bits while
// packing rows k and k+8 into one register.
__m512 upper4 = _mm512_shuffle_f32x4(sum893, sum901, 238);
__m512 upper5 = _mm512_shuffle_f32x4(sum897, sum905, 238);
sum893 = _mm512_shuffle_f32x4(sum893, sum901, 68);
sum897 = _mm512_shuffle_f32x4(sum897, sum905, 68);
sum893 = _mm512_add_ps(sum893, upper4);
sum897 = _mm512_add_ps(sum897, upper5);
__m512 upper7 = _mm512_shuffle_f32x4(sum895, sum903, 238);
__m512 upper8 = _mm512_shuffle_f32x4(sum899, sum907, 238);
sum895 = _mm512_shuffle_f32x4(sum895, sum903, 68);
sum899 = _mm512_shuffle_f32x4(sum899, sum907, 68);
sum895 = _mm512_add_ps(sum895, upper7);
sum899 = _mm512_add_ps(sum899, upper8);
// Stage 2 (even rows): fold 128-bit blocks; each register now carries four
// rows, one per 128-bit block.
__m512 upper3 = _mm512_permutex2var_ps(sum893, pm4Hi1, sum897);
__m512 upper6 = _mm512_permutex2var_ps(sum895, pm4Hi1, sum899);
sum893 = _mm512_permutex2var_ps(sum893, pm4Lo1, sum897);
sum895 = _mm512_permutex2var_ps(sum895, pm4Lo1, sum899);
sum893 = _mm512_add_ps(sum893, upper3);
sum895 = _mm512_add_ps(sum895, upper6);
// Stages 1 and 2 again for the odd rows.
__m512 upper11 = _mm512_shuffle_f32x4(sum894, sum902, 238);
__m512 upper12 = _mm512_shuffle_f32x4(sum898, sum906, 238);
sum894 = _mm512_shuffle_f32x4(sum894, sum902, 68);
sum898 = _mm512_shuffle_f32x4(sum898, sum906, 68);
sum894 = _mm512_add_ps(sum894, upper11);
sum898 = _mm512_add_ps(sum898, upper12);
__m512 upper14 = _mm512_shuffle_f32x4(sum896, sum904, 238);
__m512 upper15 = _mm512_shuffle_f32x4(sum900, sum908, 238);
sum896 = _mm512_shuffle_f32x4(sum896, sum904, 68);
sum900 = _mm512_shuffle_f32x4(sum900, sum908, 68);
sum896 = _mm512_add_ps(sum896, upper14);
sum900 = _mm512_add_ps(sum900, upper15);
__m512 upper10 = _mm512_permutex2var_ps(sum894, pm4Hi1, sum898);
__m512 upper13 = _mm512_permutex2var_ps(sum896, pm4Hi1, sum900);
sum894 = _mm512_permutex2var_ps(sum894, pm4Lo1, sum898);
sum896 = _mm512_permutex2var_ps(sum896, pm4Lo1, sum900);
sum894 = _mm512_add_ps(sum894, upper10);
sum896 = _mm512_add_ps(sum896, upper13);
// Stage 3: fold within each 128-bit block (4 -> 2 partials per row).
__m512 upper2 = _mm512_shuffle_ps(sum893, sum895, 238);
__m512 upper9 = _mm512_shuffle_ps(sum894, sum896, 238);
sum893 = _mm512_shuffle_ps(sum893, sum895, 68);
sum894 = _mm512_shuffle_ps(sum894, sum896, 68);
sum893 = _mm512_add_ps(sum893, upper2);
sum894 = _mm512_add_ps(sum894, upper9);
// Stage 4: add the last two partials and interleave even/odd rows into lane order.
__m512 upper1 = _mm512_permutex2var_ps(sum893, pm1Hi1, sum894);
sum893 = _mm512_permutex2var_ps(sum893, pm1Lo1, sum894);
sum893 = _mm512_add_ps(sum893, upper1);
sum893 = _mm512_add_ps(sum893, bias12);
_mm512_mask_storeu_ps(datPtr57+(ptrdiff_t)0+(ptrdiff_t)64*i103, 65535, sum893);
}
return;
}
// Partial slab: 8 outputs, so 8 accumulators and 256 weight bytes per step.
for (ptrdiff_t i104 = 0; i104 < 1; ++i104) {
__m512 sum909 = _mm512_setzero_ps();
__m512 sum910 = _mm512_setzero_ps();
__m512 sum911 = _mm512_setzero_ps();
__m512 sum912 = _mm512_setzero_ps();
__m512 sum913 = _mm512_setzero_ps();
__m512 sum914 = _mm512_setzero_ps();
__m512 sum915 = _mm512_setzero_ps();
__m512 sum916 = _mm512_setzero_ps();
for (ptrdiff_t j94 = 0; j94 < 128; ++j94) {
__m512i wts9 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512 dat2713 = _mm512_maskz_loadu_ps(65535, datPtr56+(ptrdiff_t)0+(ptrdiff_t)64*j94);
__m512i wts10 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512i wts11 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512i wts12 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512 wtLo25 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts9));
__m512 wtHi25 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts9, 1));
__m512 wtLo26 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts10));
__m512 wtHi26 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts10, 1));
__m512 wtLo27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts11));
__m512 wtHi27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts11, 1));
__m512 wtLo28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts12));
__m512 wtHi28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts12, 1));
sum909 = _mm512_fmadd_ps(wtLo25, dat2713, sum909);
sum910 = _mm512_fmadd_ps(wtHi25, dat2713, sum910);
sum911 = _mm512_fmadd_ps(wtLo26, dat2713, sum911);
sum912 = _mm512_fmadd_ps(wtHi26, dat2713, sum912);
sum913 = _mm512_fmadd_ps(wtLo27, dat2713, sum913);
sum914 = _mm512_fmadd_ps(wtHi27, dat2713, sum914);
sum915 = _mm512_fmadd_ps(wtLo28, dat2713, sum915);
sum916 = _mm512_fmadd_ps(wtHi28, dat2713, sum916);
}
__m512 bias13 = _mm512_maskz_loadu_ps(255, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)32*i104);
// Same reduction idea with 8 accumulators: after the last stage lanes 0..7
// hold the horizontal sums of sum909..sum916 in order (lanes 8..15 are not stored).
__m512i pmEven1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmOdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm4Lo2 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi2 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Stage 1: fold 256-bit halves, packing rows k and k+4 together.
__m512 upper18 = _mm512_shuffle_f32x4(sum909, sum913, 238);
__m512 upper19 = _mm512_shuffle_f32x4(sum911, sum915, 238);
sum909 = _mm512_shuffle_f32x4(sum909, sum913, 68);
sum911 = _mm512_shuffle_f32x4(sum911, sum915, 68);
sum909 = _mm512_add_ps(sum909, upper18);
sum911 = _mm512_add_ps(sum911, upper19);
__m512 upper21 = _mm512_shuffle_f32x4(sum910, sum914, 238);
__m512 upper22 = _mm512_shuffle_f32x4(sum912, sum916, 238);
sum910 = _mm512_shuffle_f32x4(sum910, sum914, 68);
sum912 = _mm512_shuffle_f32x4(sum912, sum916, 68);
sum910 = _mm512_add_ps(sum910, upper21);
sum912 = _mm512_add_ps(sum912, upper22);
// Stage 2: fold 128-bit blocks; one row per block afterwards.
__m512 upper17 = _mm512_permutex2var_ps(sum909, pm4Hi2, sum911);
__m512 upper20 = _mm512_permutex2var_ps(sum910, pm4Hi2, sum912);
sum909 = _mm512_permutex2var_ps(sum909, pm4Lo2, sum911);
sum910 = _mm512_permutex2var_ps(sum910, pm4Lo2, sum912);
sum909 = _mm512_add_ps(sum909, upper17);
sum910 = _mm512_add_ps(sum910, upper20);
// Stage 3: fold within each 128-bit block, merging even and odd rows.
__m512 upper16 = _mm512_shuffle_ps(sum909, sum910, 238);
sum909 = _mm512_shuffle_ps(sum909, sum910, 68);
sum909 = _mm512_add_ps(sum909, upper16);
// Stage 4: add adjacent lane pairs (single-source permute uses the low 4 index bits).
__m512 upper23 = _mm512_permutexvar_ps(pmOdd1, sum909);
sum909 = _mm512_permutexvar_ps(pmEven1, sum909);
sum909 = _mm512_add_ps(sum909, upper23);
sum909 = _mm512_add_ps(sum909, bias13);
_mm512_mask_storeu_ps(datPtr57+(ptrdiff_t)0+(ptrdiff_t)32*i104, 255, sum909);
}
}

// Hands ResNeXt50FcApply1Callee1 to the threader team as a one-dimensional
// task with extent 63, passing the tensor pointer table through unchanged.
static void ResNeXt50FcApply1(ResNeXt50ThreaderTeam1* team98, char** tensors171) {
    ResNeXt50ThreaderTask1 job;
    // Iteration space: a single dimension of extent 63.
    job.nd1 = 1;
    job.hull1[0] = 63;
    // Kernel and its argument.
    job.any1 = tensors171;
    job.callee1 = ResNeXt50FcApply1Callee1;
    ResNeXt50ThreaderDo1(team98, &job);
}

static void ResNeXt50OneArrangeWts1Callee1(ResNeXt50ThreaderTask1* task14, int64_t* pt12) {
char** tensors12 = task14->any1;
ptrdiff_t b44 = pt12[0];
char*restrict wtPtr2 = tensors12[0]+(ptrdiff_t)3340*0+(ptrdiff_t)98304*0;
char*restrict biasPtr2 = tensors12[1]+(ptrdiff_t)1536*0;
char*restrict bnPtr3 = tensors12[2]+(ptrdiff_t)8*384*0;
char*restrict wtPtr3 = tensors12[3]+(ptrdiff_t)3340*0+(ptrdiff_t)98304*0;
char*restrict biasPtr3 = tensors12[4]+(ptrdiff_t)1536*0;
char*restrict bnPtr4 = tensors12[5]+(ptrdiff_t)8*384*0;
char*restrict arranged1 = tensors12[6]+(ptrdiff_t)1284096*0+(ptrdiff_t)99840*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i13 = 0; i13 < ii1; ++i13) {
ptrdiff_t j7 = 8*b44;
ptrdiff_t jj19 = j7+8;
for (; j7 < jj19; ++j7) {
if (j7 < 16) {
ptrdiff_t k46 = 0+16*(j7-0);
ptrdiff_t l9 = (size_t)(0+k46)/6;
ptrdiff_t cut1 = (size_t)(0+k46)%6;
switch (cut1) {
case 0:;
case 2: {
__m512 sum2 = _mm512_maskz_loadu_ps(65535, biasPtr2+1536*i13+4*k46);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k46+384*i13));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k46+384*i13)+(ptrdiff_t)64);
__m512 postMul4 = _mm512_permutex2var_ps(masLo1, pmMul2, masHi1);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo1, pmAdd2, masHi1);
sum2 = _mm512_fmadd_ps(sum2, postMul4, postAdd2);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 65535-(4095>>cut1), sum2);
ptrdiff_t c5 = 0;
for (; c5 != 4; ++c5) {
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)0);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)256);
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)512);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)768);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1024);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1280);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1536);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1792);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2048);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2304);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2560);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2816);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3072);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3328);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3584);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3840);
__m512 tmp1 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp2 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp3 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp4 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp5 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp6 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp7 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp8 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp9 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp10 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp11 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp12 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp13 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp14 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp15 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp16 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt15 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt23 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt16 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt24 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt17 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt25 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt18 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt26 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt19 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt27 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt20 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt28 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt21 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt29 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt22 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt30 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt15 = _mm512_mul_ps(wt15, postMul4);
wt16 = _mm512_mul_ps(wt16, postMul4);
wt17 = _mm512_mul_ps(wt17, postMul4);
wt18 = _mm512_mul_ps(wt18, postMul4);
wt19 = _mm512_mul_ps(wt19, postMul4);
wt20 = _mm512_mul_ps(wt20, postMul4);
wt21 = _mm512_mul_ps(wt21, postMul4);
wt22 = _mm512_mul_ps(wt22, postMul4);
wt23 = _mm512_mul_ps(wt23, postMul4);
wt24 = _mm512_mul_ps(wt24, postMul4);
wt25 = _mm512_mul_ps(wt25, postMul4);
wt26 = _mm512_mul_ps(wt26, postMul4);
wt27 = _mm512_mul_ps(wt27, postMul4);
wt28 = _mm512_mul_ps(wt28, postMul4);
wt29 = _mm512_mul_ps(wt29, postMul4);
wt30 = _mm512_mul_ps(wt30, postMul4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt16);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt19);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt20);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt21);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt22);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt23);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt24);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt25);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt26);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt27);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt28);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt29);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt30);
}
break;
}
default: {
cut1 = 4;
__m512 sum3 = _mm512_maskz_loadu_ps(65535, biasPtr2+1536*i13+4*k46);
__m512i pmMul3 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd3 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k46+384*i13));
__m512 masHi2 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k46+384*i13)+(ptrdiff_t)64);
__m512 postMul5 = _mm512_permutex2var_ps(masLo2, pmMul3, masHi2);
__m512 postAdd3 = _mm512_permutex2var_ps(masLo2, pmAdd3, masHi2);
sum3 = _mm512_fmadd_ps(sum3, postMul5, postAdd3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 258048>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)4608, 65535-(262143>>cut1), sum3);
ptrdiff_t c6 = 0;
for (; c6 != 4; ++c6) {
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)0);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)256);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)512);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)768);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1024);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1280);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1536);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1792);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2048);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2304);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2560);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2816);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3072);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3328);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3584);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3840);
__m512 tmp49 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp50 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp51 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp52 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp53 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp54 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp55 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp56 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp57 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp58 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp59 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp60 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp61 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp62 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp63 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp64 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt31 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt39 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt32 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt40 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt33 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt41 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt34 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt42 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt35 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt43 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt36 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt44 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt37 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt45 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt38 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt46 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
wt31 = _mm512_mul_ps(wt31, postMul5);
wt32 = _mm512_mul_ps(wt32, postMul5);
wt33 = _mm512_mul_ps(wt33, postMul5);
wt34 = _mm512_mul_ps(wt34, postMul5);
wt35 = _mm512_mul_ps(wt35, postMul5);
wt36 = _mm512_mul_ps(wt36, postMul5);
wt37 = _mm512_mul_ps(wt37, postMul5);
wt38 = _mm512_mul_ps(wt38, postMul5);
wt39 = _mm512_mul_ps(wt39, postMul5);
wt40 = _mm512_mul_ps(wt40, postMul5);
wt41 = _mm512_mul_ps(wt41, postMul5);
wt42 = _mm512_mul_ps(wt42, postMul5);
wt43 = _mm512_mul_ps(wt43, postMul5);
wt44 = _mm512_mul_ps(wt44, postMul5);
wt45 = _mm512_mul_ps(wt45, postMul5);
wt46 = _mm512_mul_ps(wt46, postMul5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt46);
}
}
}
} else {
ptrdiff_t k47 = 0+16*(j7-16);
ptrdiff_t l10 = (size_t)(256+k47)/6;
ptrdiff_t cut2 = (size_t)(256+k47)%6;
switch (cut2) {
case 0:;
case 2: {
__m512 sum4 = _mm512_maskz_loadu_ps(65535, biasPtr3+1536*i13+4*k47);
__m512i pmMul4 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd4 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo3 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k47+384*i13));
__m512 masHi3 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k47+384*i13)+(ptrdiff_t)64);
__m512 postMul6 = _mm512_permutex2var_ps(masLo3, pmMul4, masHi3);
__m512 postAdd4 = _mm512_permutex2var_ps(masLo3, pmAdd4, masHi3);
sum4 = _mm512_fmadd_ps(sum4, postMul6, postAdd4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 65535-(4095>>cut2), sum4);
ptrdiff_t c7 = 0;
for (; c7 != 4; ++c7) {
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)0);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)256);
__m512 wt49 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)512);
__m512 wt50 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)768);
__m512 wt51 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1024);
__m512 wt52 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1280);
__m512 wt53 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1536);
__m512 wt54 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1792);
__m512 wt55 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2048);
__m512 wt56 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2304);
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2560);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2816);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3072);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3328);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3584);
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3840);
__m512 tmp97 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp98 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp99 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp100 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp101 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp102 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp103 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp104 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp105 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp106 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp107 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp108 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp109 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp110 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp111 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp112 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt47 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt55 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt48 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt56 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt49 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt57 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt50 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt58 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt51 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt59 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt52 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt60 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt53 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt61 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt54 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt62 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
wt47 = _mm512_mul_ps(wt47, postMul6);
wt48 = _mm512_mul_ps(wt48, postMul6);
wt49 = _mm512_mul_ps(wt49, postMul6);
wt50 = _mm512_mul_ps(wt50, postMul6);
wt51 = _mm512_mul_ps(wt51, postMul6);
wt52 = _mm512_mul_ps(wt52, postMul6);
wt53 = _mm512_mul_ps(wt53, postMul6);
wt54 = _mm512_mul_ps(wt54, postMul6);
wt55 = _mm512_mul_ps(wt55, postMul6);
wt56 = _mm512_mul_ps(wt56, postMul6);
wt57 = _mm512_mul_ps(wt57, postMul6);
wt58 = _mm512_mul_ps(wt58, postMul6);
wt59 = _mm512_mul_ps(wt59, postMul6);
wt60 = _mm512_mul_ps(wt60, postMul6);
wt61 = _mm512_mul_ps(wt61, postMul6);
wt62 = _mm512_mul_ps(wt62, postMul6);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)0, 63>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)0, 63>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)0, 63>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)0, 63>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)0, 63>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)0, 63>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)0, 63>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)0, 63>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)0, 63>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)0, 63>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)0, 63>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)0, 63>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)0, 63>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt48);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt49);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt50);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt51);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt52);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt53);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt54);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt55);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt56);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt57);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt58);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt59);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt60);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt61);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt62);
}
break;
}
default: {
cut2 = 4;
__m512 sum5 = _mm512_maskz_loadu_ps(65535, biasPtr3+1536*i13+4*k47);
__m512i pmMul5 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd5 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo4 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k47+384*i13));
__m512 masHi4 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k47+384*i13)+(ptrdiff_t)64);
__m512 postMul7 = _mm512_permutex2var_ps(masLo4, pmMul5, masHi4);
__m512 postAdd5 = _mm512_permutex2var_ps(masLo4, pmAdd5, masHi4);
sum5 = _mm512_fmadd_ps(sum5, postMul7, postAdd5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 258048>>cut2, sum5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)4608, 65535-(262143>>cut2), sum5);
ptrdiff_t c8 = 0;
for (; c8 != 4; ++c8) {
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)0);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)256);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)512);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)768);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1024);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1280);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1536);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1792);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2048);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2304);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2560);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2816);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3072);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3328);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3584);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3840);
__m512 tmp145 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp146 = _mm512_unpackhi_ps(wt63, wt64);
__m512 tmp147 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp148 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp149 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp150 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp151 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp152 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp153 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp154 = _mm512_unpackhi_ps(wt71, wt72);
__m512 tmp155 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp156 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp157 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp158 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp159 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp160 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp161 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp150, tmp152, 238);
__m512 tmp169 = _mm512_shuffle_ps(tmp153, tmp155, 68);
__m512 tmp170 = _mm512_shuffle_ps(tmp153, tmp155, 238);
__m512 tmp171 = _mm512_shuffle_ps(tmp154, tmp156, 68);
__m512 tmp172 = _mm512_shuffle_ps(tmp154, tmp156, 238);
__m512 tmp173 = _mm512_shuffle_ps(tmp157, tmp159, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp157, tmp159, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp158, tmp160, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp158, tmp160, 238);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp163, tmp167, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp163, tmp167, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp164, tmp168, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp164, tmp168, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp169, tmp173, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp169, tmp173, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp170, tmp174, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp170, tmp174, 221);
__m512 tmp189 = _mm512_shuffle_f32x4(tmp171, tmp175, 136);
__m512 tmp190 = _mm512_shuffle_f32x4(tmp171, tmp175, 221);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp172, tmp176, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp172, tmp176, 221);
wt63 = _mm512_shuffle_f32x4(tmp177, tmp185, 136);
wt71 = _mm512_shuffle_f32x4(tmp177, tmp185, 221);
wt64 = _mm512_shuffle_f32x4(tmp179, tmp187, 136);
wt72 = _mm512_shuffle_f32x4(tmp179, tmp187, 221);
wt65 = _mm512_shuffle_f32x4(tmp181, tmp189, 136);
wt73 = _mm512_shuffle_f32x4(tmp181, tmp189, 221);
wt66 = _mm512_shuffle_f32x4(tmp183, tmp191, 136);
wt74 = _mm512_shuffle_f32x4(tmp183, tmp191, 221);
wt67 = _mm512_shuffle_f32x4(tmp178, tmp186, 136);
wt75 = _mm512_shuffle_f32x4(tmp178, tmp186, 221);
wt68 = _mm512_shuffle_f32x4(tmp180, tmp188, 136);
wt76 = _mm512_shuffle_f32x4(tmp180, tmp188, 221);
wt69 = _mm512_shuffle_f32x4(tmp182, tmp190, 136);
wt77 = _mm512_shuffle_f32x4(tmp182, tmp190, 221);
wt70 = _mm512_shuffle_f32x4(tmp184, tmp192, 136);
wt78 = _mm512_shuffle_f32x4(tmp184, tmp192, 221);
wt63 = _mm512_mul_ps(wt63, postMul7);
wt64 = _mm512_mul_ps(wt64, postMul7);
wt65 = _mm512_mul_ps(wt65, postMul7);
wt66 = _mm512_mul_ps(wt66, postMul7);
wt67 = _mm512_mul_ps(wt67, postMul7);
wt68 = _mm512_mul_ps(wt68, postMul7);
wt69 = _mm512_mul_ps(wt69, postMul7);
wt70 = _mm512_mul_ps(wt70, postMul7);
wt71 = _mm512_mul_ps(wt71, postMul7);
wt72 = _mm512_mul_ps(wt72, postMul7);
wt73 = _mm512_mul_ps(wt73, postMul7);
wt74 = _mm512_mul_ps(wt74, postMul7);
wt75 = _mm512_mul_ps(wt75, postMul7);
wt76 = _mm512_mul_ps(wt76, postMul7);
wt77 = _mm512_mul_ps(wt77, postMul7);
wt78 = _mm512_mul_ps(wt78, postMul7);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)0, 63>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)0, 63>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)0, 63>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)0, 63>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)0, 63>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)0, 63>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)0, 63>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)0, 63>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)0, 63>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)0, 63>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)0, 63>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)0, 63>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)0, 63>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)0, 63>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)0, 63>>cut2, wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)0, 63>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt78);
}
}
}
}
}
}
}

/*
 * Dispatches ResNeXt50OneArrangeWts1Callee1 through the threader.
 *
 * Builds a task with a 3-dimensional hull of extents {3, 1, 1}, passes
 * tensors11 through unchanged as the task's opaque argument, and hands the
 * task to ResNeXt50ThreaderDo1 on the given team.
 *
 * NOTE(review): ResNeXt50ThreaderDo1 is defined elsewhere; presumably it
 * invokes the callee once per point of the hull -- confirm there.
 */
static void ResNeXt50OneArrangeWts1(ResNeXt50ThreaderTeam1* team19, char** tensors11) {
    static const int extents[3] = {3, 1, 1};
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    for (int d = 0; d < 3; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = tensors11;
    job.callee1 = ResNeXt50OneArrangeWts1Callee1;
    ResNeXt50ThreaderDo1(team19, &job);
}

/*
 * Threader callee that copies a slice of the source tensor into the
 * "arranged" buffer with a different stride layout.
 *
 * task16->any1 is read as a char** whose element 0 is the source buffer and
 * element 1 the destination buffer. pt13[1] selects which blocks this call
 * handles: blocks 2*pt13[1] and 2*pt13[1]+1, plus one extra block when
 * pt13[1] is not below 23. Block index 49 is never processed.
 *
 * For every block, 64 rows of 256 bytes (four 16-float vectors) are copied:
 *   source offset      = 256*block   + 12608*row
 *   destination offset = 16384*block + 256*row
 *
 * NOTE(review): presumably `row` is the input channel and `block` a run of
 * 64 pixels of a 56x56 plane (49*64 = 3136) -- inferred from the strides,
 * confirm against the tensor layout.
 */
static void ResNeXt50OneArrangeDats1Callee1(ResNeXt50ThreaderTask1* task16, int64_t* pt13) {
    char** bufs = task16->any1;
    const ptrdiff_t slot = pt13[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const __mmask16 allLanes = 65535;

    const ptrdiff_t firstBlk = 2*slot;
    const ptrdiff_t lastBlk = firstBlk+(slot < 23 ? 1 : 2);

    /* Stop after lastBlk, or earlier if the index reaches the sentinel 49. */
    for (ptrdiff_t blk = firstBlk; blk != 49 && blk <= lastBlk; ++blk) {
        for (ptrdiff_t row = 0; row < 64; ++row) {
            const char* from = src+256*blk+12608*row;
            char* to = dst+16384*blk+256*row;

            /* Load the whole 256-byte row first, then write it out. */
            const __m512 q0 = _mm512_maskz_loadu_ps(allLanes, from);
            const __m512 q1 = _mm512_maskz_loadu_ps(allLanes, from+64);
            const __m512 q2 = _mm512_maskz_loadu_ps(allLanes, from+128);
            const __m512 q3 = _mm512_maskz_loadu_ps(allLanes, from+192);

            _mm512_mask_storeu_ps(to, allLanes, q0);
            _mm512_mask_storeu_ps(to+64, allLanes, q1);
            _mm512_mask_storeu_ps(to+128, allLanes, q2);
            _mm512_mask_storeu_ps(to+192, allLanes, q3);
        }
    }
}

/*
 * Dispatches ResNeXt50OneArrangeDats1Callee1 through the threader.
 *
 * Builds a task with a 4-dimensional hull of extents {1, 24, 1, 1}, passes
 * tensors13 through unchanged as the task's opaque argument, and hands the
 * task to ResNeXt50ThreaderDo1 on the given team.
 *
 * NOTE(review): the callee reads only coordinate 1 of the point it is given,
 * so presumably that coordinate ranges over 0..23 -- confirm in the threader.
 */
static void ResNeXt50OneArrangeDats1(ResNeXt50ThreaderTeam1* team20, char** tensors13) {
    static const int extents[4] = {1, 24, 1, 1};
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    for (int d = 0; d < 4; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = tensors13;
    job.callee1 = ResNeXt50OneArrangeDats1Callee1;
    ResNeXt50ThreaderDo1(team20, &job);
}

/*
 * Threader callee that multiplies arranged weights by arranged data and
 * writes the result tensor.
 *
 * task18->any1 is read as a void** whose element 0 is a char** of buffers:
 *   [0] arranged weights, [1] arranged data, [2] output.
 * pt14[1] selects one 256-byte column block (four 16-float vectors) and
 * pt14[0] selects a strip of eight consecutive weight groups
 * (8*pt14[0] .. 8*pt14[0]+7). Nothing is done when pt14[1] is 49, and the
 * group loop stops if its index equals 64.
 *
 * Each weight group is a 1560-byte record: six floats of initial accumulator
 * values followed by 64 entries of six floats each. For every group the
 * kernel keeps a 6x4 tile of vector accumulators, seeds each tile row with
 * its initial value, adds weight*data over the 64 entries, and stores the
 * six rows 12608 bytes apart starting at offset 256*block + 75648*group.
 *
 * NOTE(review): presumably the six rows are output channels and the 64
 * entries input channels, with the initial values being a folded bias --
 * confirm against the weight-arranging code.
 */
static void ResNeXt50OneApply1Callee1(ResNeXt50ThreaderTask1* task18, int64_t* pt14) {
    void** bundle = task18->any1;
    char** bufs = bundle[0];
    const ptrdiff_t blk = pt14[1];
    const ptrdiff_t strip = pt14[0];
    char*restrict wts = bufs[0];
    char*restrict dats = bufs[1];
    char*restrict out = bufs[2];
    const __mmask16 allLanes = 65535;

    if (blk == 49) return;

    const ptrdiff_t firstGrp = 8*strip;
    const ptrdiff_t lastGrp = firstGrp+7;
    for (ptrdiff_t grp = firstGrp; grp != 64 && grp <= lastGrp; ++grp) {
        const char* wtGrp = wts+1560*grp;
        const char* datBlk = dats+16384*blk;
        char* dst = out+256*blk+75648*grp;

        /* Seed every accumulator in a tile row with that row's initial value. */
        const float* init = (const float*)wtGrp;
        __m512 acc00 = _mm512_set1_ps(init[0]);
        __m512 acc10 = _mm512_set1_ps(init[1]);
        __m512 acc20 = _mm512_set1_ps(init[2]);
        __m512 acc30 = _mm512_set1_ps(init[3]);
        __m512 acc40 = _mm512_set1_ps(init[4]);
        __m512 acc50 = _mm512_set1_ps(init[5]);
        __m512 acc01 = acc00, acc02 = acc00, acc03 = acc00;
        __m512 acc11 = acc10, acc12 = acc10, acc13 = acc10;
        __m512 acc21 = acc20, acc22 = acc20, acc23 = acc20;
        __m512 acc31 = acc30, acc32 = acc30, acc33 = acc30;
        __m512 acc41 = acc40, acc42 = acc40, acc43 = acc40;
        __m512 acc51 = acc50, acc52 = acc50, acc53 = acc50;

        for (ptrdiff_t s = 0; s < 64; ++s) {
            /* Four data vectors shared by all six tile rows. */
            const char* x = datBlk+256*s;
            const __m512 x0 = _mm512_loadu_ps(x);
            const __m512 x1 = _mm512_loadu_ps(x+64);
            const __m512 x2 = _mm512_loadu_ps(x+128);
            const __m512 x3 = _mm512_loadu_ps(x+192);

            /* Entry s sits after the 24-byte initial-value header. */
            const float* w = (const float*)(wtGrp+24*(s+1));
            __m512 b;

            b = _mm512_set1_ps(w[0]);
            acc00 = _mm512_fmadd_ps(b, x0, acc00);
            acc01 = _mm512_fmadd_ps(b, x1, acc01);
            acc02 = _mm512_fmadd_ps(b, x2, acc02);
            acc03 = _mm512_fmadd_ps(b, x3, acc03);

            b = _mm512_set1_ps(w[1]);
            acc10 = _mm512_fmadd_ps(b, x0, acc10);
            acc11 = _mm512_fmadd_ps(b, x1, acc11);
            acc12 = _mm512_fmadd_ps(b, x2, acc12);
            acc13 = _mm512_fmadd_ps(b, x3, acc13);

            b = _mm512_set1_ps(w[2]);
            acc20 = _mm512_fmadd_ps(b, x0, acc20);
            acc21 = _mm512_fmadd_ps(b, x1, acc21);
            acc22 = _mm512_fmadd_ps(b, x2, acc22);
            acc23 = _mm512_fmadd_ps(b, x3, acc23);

            b = _mm512_set1_ps(w[3]);
            acc30 = _mm512_fmadd_ps(b, x0, acc30);
            acc31 = _mm512_fmadd_ps(b, x1, acc31);
            acc32 = _mm512_fmadd_ps(b, x2, acc32);
            acc33 = _mm512_fmadd_ps(b, x3, acc33);

            b = _mm512_set1_ps(w[4]);
            acc40 = _mm512_fmadd_ps(b, x0, acc40);
            acc41 = _mm512_fmadd_ps(b, x1, acc41);
            acc42 = _mm512_fmadd_ps(b, x2, acc42);
            acc43 = _mm512_fmadd_ps(b, x3, acc43);

            b = _mm512_set1_ps(w[5]);
            acc50 = _mm512_fmadd_ps(b, x0, acc50);
            acc51 = _mm512_fmadd_ps(b, x1, acc51);
            acc52 = _mm512_fmadd_ps(b, x2, acc52);
            acc53 = _mm512_fmadd_ps(b, x3, acc53);
        }

        /* Tile rows are 12608 bytes apart in the output; columns 64 bytes. */
        _mm512_mask_storeu_ps(dst+0*12608+0, allLanes, acc00);
        _mm512_mask_storeu_ps(dst+0*12608+64, allLanes, acc01);
        _mm512_mask_storeu_ps(dst+0*12608+128, allLanes, acc02);
        _mm512_mask_storeu_ps(dst+0*12608+192, allLanes, acc03);

        _mm512_mask_storeu_ps(dst+1*12608+0, allLanes, acc10);
        _mm512_mask_storeu_ps(dst+1*12608+64, allLanes, acc11);
        _mm512_mask_storeu_ps(dst+1*12608+128, allLanes, acc12);
        _mm512_mask_storeu_ps(dst+1*12608+192, allLanes, acc13);

        _mm512_mask_storeu_ps(dst+2*12608+0, allLanes, acc20);
        _mm512_mask_storeu_ps(dst+2*12608+64, allLanes, acc21);
        _mm512_mask_storeu_ps(dst+2*12608+128, allLanes, acc22);
        _mm512_mask_storeu_ps(dst+2*12608+192, allLanes, acc23);

        _mm512_mask_storeu_ps(dst+3*12608+0, allLanes, acc30);
        _mm512_mask_storeu_ps(dst+3*12608+64, allLanes, acc31);
        _mm512_mask_storeu_ps(dst+3*12608+128, allLanes, acc32);
        _mm512_mask_storeu_ps(dst+3*12608+192, allLanes, acc33);

        _mm512_mask_storeu_ps(dst+4*12608+0, allLanes, acc40);
        _mm512_mask_storeu_ps(dst+4*12608+64, allLanes, acc41);
        _mm512_mask_storeu_ps(dst+4*12608+128, allLanes, acc42);
        _mm512_mask_storeu_ps(dst+4*12608+192, allLanes, acc43);

        _mm512_mask_storeu_ps(dst+5*12608+0, allLanes, acc50);
        _mm512_mask_storeu_ps(dst+5*12608+64, allLanes, acc51);
        _mm512_mask_storeu_ps(dst+5*12608+128, allLanes, acc52);
        _mm512_mask_storeu_ps(dst+5*12608+192, allLanes, acc53);
    }
}

/*
 * Dispatches ResNeXt50OneApply1Callee1 through the threader.
 *
 * The callee receives a two-slot pointer array as its opaque argument:
 * slot 0 holds tensors15 and slot 1 is null. The array lives on this
 * function's stack, so it is only valid while ResNeXt50ThreaderDo1 runs.
 * The task uses a 3-dimensional hull of extents {8, 49, 1}.
 *
 * NOTE(review): presumably ResNeXt50ThreaderDo1 does not return until every
 * callee invocation has finished (otherwise the stack array would dangle) --
 * confirm in the threader.
 */
static void ResNeXt50OneApply1(ResNeXt50ThreaderTeam1* team21, char** tensors15) {
    static const int extents[3] = {8, 49, 1};
    void* bundle[2];
    bundle[0] = tensors15;
    bundle[1] = 0;

    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    for (int d = 0; d < 3; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = bundle;
    job.callee1 = ResNeXt50OneApply1Callee1;
    ResNeXt50ThreaderDo1(team21, &job);
}

static void ResNeXt50OneArrangeWts2Callee1(ResNeXt50ThreaderTask1* task28, int64_t* pt19) {
char** tensors26 = task28->any1;
ptrdiff_t b48 = pt19[0];
char*restrict wtPtr5 = tensors26[0]+(ptrdiff_t)3340*0+(ptrdiff_t)131072*0;
char*restrict biasPtr5 = tensors26[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr6 = tensors26[2]+(ptrdiff_t)8*256*0;
char*restrict arranged3 = tensors26[3]+(ptrdiff_t)856064*0+(ptrdiff_t)132096*0;
ptrdiff_t ii8 = 1;
for (ptrdiff_t i20 = 0; i20 < ii8; ++i20) {
ptrdiff_t j14 = 4*b48;
ptrdiff_t jj23 = j14+4;
for (; j14 < jj23; ++j14) {
if (j14 < 15) {
ptrdiff_t k71 = 0+16*(j14-0);
ptrdiff_t l23 = (size_t)(0+k71)/6;
ptrdiff_t cut5 = (size_t)(0+k71)%6;
switch (cut5) {
case 0:;
case 2: {
__m512 sum71 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i20+4*k71);
__m512i pmMul7 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd7 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo5 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k71+256*i20));
__m512 masHi5 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k71+256*i20)+(ptrdiff_t)64);
__m512 postMul14 = _mm512_permutex2var_ps(masLo5, pmMul7, masHi5);
__m512 postAdd8 = _mm512_permutex2var_ps(masLo5, pmAdd7, masHi5);
sum71 = _mm512_fmadd_ps(sum71, postMul14, postAdd8);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum71);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)3072, 4032>>cut5, sum71);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)6144, 65535-(4095>>cut5), sum71);
ptrdiff_t c12 = 0;
for (; c12 != 8; ++c12) {
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)0);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)512);
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)1024);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)1536);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)2048);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)2560);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)3072);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)3584);
__m512 wt113 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)4096);
__m512 wt114 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)4608);
__m512 wt115 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)5120);
__m512 wt116 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)5632);
__m512 wt117 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)6144);
__m512 wt118 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)6656);
__m512 wt119 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)7168);
__m512 wt120 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)7680);
__m512 tmp5205 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp5206 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp5207 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp5208 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp5209 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp5210 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp5211 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp5212 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp5213 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp5214 = _mm512_unpackhi_ps(wt113, wt114);
__m512 tmp5215 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp5216 = _mm512_unpackhi_ps(wt115, wt116);
__m512 tmp5217 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp5218 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp5219 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp5220 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp5221 = _mm512_shuffle_ps(tmp5205, tmp5207, 68);
__m512 tmp5222 = _mm512_shuffle_ps(tmp5205, tmp5207, 238);
__m512 tmp5223 = _mm512_shuffle_ps(tmp5206, tmp5208, 68);
__m512 tmp5224 = _mm512_shuffle_ps(tmp5206, tmp5208, 238);
__m512 tmp5225 = _mm512_shuffle_ps(tmp5209, tmp5211, 68);
__m512 tmp5226 = _mm512_shuffle_ps(tmp5209, tmp5211, 238);
__m512 tmp5227 = _mm512_shuffle_ps(tmp5210, tmp5212, 68);
__m512 tmp5228 = _mm512_shuffle_ps(tmp5210, tmp5212, 238);
__m512 tmp5229 = _mm512_shuffle_ps(tmp5213, tmp5215, 68);
__m512 tmp5230 = _mm512_shuffle_ps(tmp5213, tmp5215, 238);
__m512 tmp5231 = _mm512_shuffle_ps(tmp5214, tmp5216, 68);
__m512 tmp5232 = _mm512_shuffle_ps(tmp5214, tmp5216, 238);
__m512 tmp5233 = _mm512_shuffle_ps(tmp5217, tmp5219, 68);
__m512 tmp5234 = _mm512_shuffle_ps(tmp5217, tmp5219, 238);
__m512 tmp5235 = _mm512_shuffle_ps(tmp5218, tmp5220, 68);
__m512 tmp5236 = _mm512_shuffle_ps(tmp5218, tmp5220, 238);
__m512 tmp5237 = _mm512_shuffle_f32x4(tmp5221, tmp5225, 136);
__m512 tmp5238 = _mm512_shuffle_f32x4(tmp5221, tmp5225, 221);
__m512 tmp5239 = _mm512_shuffle_f32x4(tmp5222, tmp5226, 136);
__m512 tmp5240 = _mm512_shuffle_f32x4(tmp5222, tmp5226, 221);
__m512 tmp5241 = _mm512_shuffle_f32x4(tmp5223, tmp5227, 136);
__m512 tmp5242 = _mm512_shuffle_f32x4(tmp5223, tmp5227, 221);
__m512 tmp5243 = _mm512_shuffle_f32x4(tmp5224, tmp5228, 136);
__m512 tmp5244 = _mm512_shuffle_f32x4(tmp5224, tmp5228, 221);
__m512 tmp5245 = _mm512_shuffle_f32x4(tmp5229, tmp5233, 136);
__m512 tmp5246 = _mm512_shuffle_f32x4(tmp5229, tmp5233, 221);
__m512 tmp5247 = _mm512_shuffle_f32x4(tmp5230, tmp5234, 136);
__m512 tmp5248 = _mm512_shuffle_f32x4(tmp5230, tmp5234, 221);
__m512 tmp5249 = _mm512_shuffle_f32x4(tmp5231, tmp5235, 136);
__m512 tmp5250 = _mm512_shuffle_f32x4(tmp5231, tmp5235, 221);
__m512 tmp5251 = _mm512_shuffle_f32x4(tmp5232, tmp5236, 136);
__m512 tmp5252 = _mm512_shuffle_f32x4(tmp5232, tmp5236, 221);
wt105 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 136);
wt113 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 221);
wt106 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 136);
wt114 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 221);
wt107 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 136);
wt115 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 221);
wt108 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 136);
wt116 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 221);
wt109 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 136);
wt117 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 221);
wt110 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 136);
wt118 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 221);
wt111 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 136);
wt119 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 221);
wt112 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 136);
wt120 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 221);
wt105 = _mm512_mul_ps(wt105, postMul14);
wt106 = _mm512_mul_ps(wt106, postMul14);
wt107 = _mm512_mul_ps(wt107, postMul14);
wt108 = _mm512_mul_ps(wt108, postMul14);
wt109 = _mm512_mul_ps(wt109, postMul14);
wt110 = _mm512_mul_ps(wt110, postMul14);
wt111 = _mm512_mul_ps(wt111, postMul14);
wt112 = _mm512_mul_ps(wt112, postMul14);
wt113 = _mm512_mul_ps(wt113, postMul14);
wt114 = _mm512_mul_ps(wt114, postMul14);
wt115 = _mm512_mul_ps(wt115, postMul14);
wt116 = _mm512_mul_ps(wt116, postMul14);
wt117 = _mm512_mul_ps(wt117, postMul14);
wt118 = _mm512_mul_ps(wt118, postMul14);
wt119 = _mm512_mul_ps(wt119, postMul14);
wt120 = _mm512_mul_ps(wt120, postMul14);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)0, 63>>cut5, wt105);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)0, 63>>cut5, wt106);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)0, 63>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)0, 63>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)0, 63>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)0, 63>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)0, 63>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)0, 63>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)0, 63>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)0, 63>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)0, 63>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)0, 63>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)0, 63>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)0, 63>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)0, 63>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)0, 63>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt105);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt106);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt105);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt106);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt107);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt108);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt109);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt110);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt111);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt112);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt113);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt114);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt115);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt116);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt117);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt118);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt119);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt120);
}
break;
}
default: {
cut5 = 4;
__m512 sum72 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i20+4*k71);
__m512i pmMul8 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd8 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo6 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k71+256*i20));
__m512 masHi6 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k71+256*i20)+(ptrdiff_t)64);
__m512 postMul15 = _mm512_permutex2var_ps(masLo6, pmMul8, masHi6);
__m512 postAdd9 = _mm512_permutex2var_ps(masLo6, pmAdd8, masHi6);
sum72 = _mm512_fmadd_ps(sum72, postMul15, postAdd9);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum72);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)3072, 4032>>cut5, sum72);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)6144, 258048>>cut5, sum72);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)9216, 65535-(262143>>cut5), sum72);
ptrdiff_t c13 = 0;
for (; c13 != 8; ++c13) {
__m512 wt121 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)0);
__m512 wt122 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)512);
__m512 wt123 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)1024);
__m512 wt124 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)1536);
__m512 wt125 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)2048);
__m512 wt126 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)2560);
__m512 wt127 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)3072);
__m512 wt128 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)3584);
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)4096);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)4608);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)5120);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)5632);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)6144);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)6656);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)7168);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)7680);
__m512 tmp5253 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp5254 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp5255 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp5256 = _mm512_unpackhi_ps(wt123, wt124);
__m512 tmp5257 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp5258 = _mm512_unpackhi_ps(wt125, wt126);
__m512 tmp5259 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp5260 = _mm512_unpackhi_ps(wt127, wt128);
__m512 tmp5261 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp5262 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp5263 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp5264 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp5265 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp5266 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp5267 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp5268 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp5269 = _mm512_shuffle_ps(tmp5253, tmp5255, 68);
__m512 tmp5270 = _mm512_shuffle_ps(tmp5253, tmp5255, 238);
__m512 tmp5271 = _mm512_shuffle_ps(tmp5254, tmp5256, 68);
__m512 tmp5272 = _mm512_shuffle_ps(tmp5254, tmp5256, 238);
__m512 tmp5273 = _mm512_shuffle_ps(tmp5257, tmp5259, 68);
__m512 tmp5274 = _mm512_shuffle_ps(tmp5257, tmp5259, 238);
__m512 tmp5275 = _mm512_shuffle_ps(tmp5258, tmp5260, 68);
__m512 tmp5276 = _mm512_shuffle_ps(tmp5258, tmp5260, 238);
__m512 tmp5277 = _mm512_shuffle_ps(tmp5261, tmp5263, 68);
__m512 tmp5278 = _mm512_shuffle_ps(tmp5261, tmp5263, 238);
__m512 tmp5279 = _mm512_shuffle_ps(tmp5262, tmp5264, 68);
__m512 tmp5280 = _mm512_shuffle_ps(tmp5262, tmp5264, 238);
__m512 tmp5281 = _mm512_shuffle_ps(tmp5265, tmp5267, 68);
__m512 tmp5282 = _mm512_shuffle_ps(tmp5265, tmp5267, 238);
__m512 tmp5283 = _mm512_shuffle_ps(tmp5266, tmp5268, 68);
__m512 tmp5284 = _mm512_shuffle_ps(tmp5266, tmp5268, 238);
__m512 tmp5285 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 136);
__m512 tmp5286 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 221);
__m512 tmp5287 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 136);
__m512 tmp5288 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 221);
__m512 tmp5289 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 136);
__m512 tmp5290 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 221);
__m512 tmp5291 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 136);
__m512 tmp5292 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 221);
__m512 tmp5293 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 136);
__m512 tmp5294 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 221);
__m512 tmp5295 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 136);
__m512 tmp5296 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 221);
__m512 tmp5297 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 136);
__m512 tmp5298 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 221);
__m512 tmp5299 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 136);
__m512 tmp5300 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 221);
wt121 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 136);
wt129 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 221);
wt122 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 136);
wt130 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 221);
wt123 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 136);
wt131 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 221);
wt124 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 136);
wt132 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 221);
wt125 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 136);
wt133 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 221);
wt126 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 136);
wt134 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 221);
wt127 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 136);
wt135 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 221);
wt128 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 136);
wt136 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 221);
wt121 = _mm512_mul_ps(wt121, postMul15);
wt122 = _mm512_mul_ps(wt122, postMul15);
wt123 = _mm512_mul_ps(wt123, postMul15);
wt124 = _mm512_mul_ps(wt124, postMul15);
wt125 = _mm512_mul_ps(wt125, postMul15);
wt126 = _mm512_mul_ps(wt126, postMul15);
wt127 = _mm512_mul_ps(wt127, postMul15);
wt128 = _mm512_mul_ps(wt128, postMul15);
wt129 = _mm512_mul_ps(wt129, postMul15);
wt130 = _mm512_mul_ps(wt130, postMul15);
wt131 = _mm512_mul_ps(wt131, postMul15);
wt132 = _mm512_mul_ps(wt132, postMul15);
wt133 = _mm512_mul_ps(wt133, postMul15);
wt134 = _mm512_mul_ps(wt134, postMul15);
wt135 = _mm512_mul_ps(wt135, postMul15);
wt136 = _mm512_mul_ps(wt136, postMul15);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)0, 63>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)0, 63>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)0, 63>>cut5, wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)0, 63>>cut5, wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)0, 63>>cut5, wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)0, 63>>cut5, wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)0, 63>>cut5, wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)0, 63>>cut5, wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)0, 63>>cut5, wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)0, 63>>cut5, wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)0, 63>>cut5, wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)0, 63>>cut5, wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)0, 63>>cut5, wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)0, 63>>cut5, wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)0, 63>>cut5, wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)0, 63>>cut5, wt136);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt136);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt136);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt136);
}
}
}
} else {
ptrdiff_t k70 = 240;
ptrdiff_t l22 = (size_t)(0+k70)/6;
ptrdiff_t cut4 = (size_t)(0+k70)%6;
__m512 sum70 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i20+4*k70);
__m512i pmMul9 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd9 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo7 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k70+256*i20));
__m512 masHi7 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k70+256*i20)+(ptrdiff_t)64);
__m512 postMul13 = _mm512_permutex2var_ps(masLo7, pmMul9, masHi7);
__m512 postAdd7 = _mm512_permutex2var_ps(masLo7, pmAdd9, masHi7);
sum70 = _mm512_fmadd_ps(sum70, postMul13, postAdd7);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum70);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*0+(ptrdiff_t)3072, 4032>>cut4, sum70);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*0+(ptrdiff_t)6144, 65535-(4095>>cut4), sum70);
ptrdiff_t c11 = 0;
for (; c11 != 8; ++c11) {
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)0);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)512);
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)1024);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)1536);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)2048);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)2560);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)3072);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)3584);
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)4096);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)4608);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)5120);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)5632);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)6144);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)6656);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)7168);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)7680);
__m512 tmp5301 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp5302 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp5303 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp5304 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp5305 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp5306 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp5307 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp5308 = _mm512_unpackhi_ps(wt95, wt96);
__m512 tmp5309 = _mm512_unpacklo_ps(wt97, wt98);
__m512 tmp5310 = _mm512_unpackhi_ps(wt97, wt98);
__m512 tmp5311 = _mm512_unpacklo_ps(wt99, wt100);
__m512 tmp5312 = _mm512_unpackhi_ps(wt99, wt100);
__m512 tmp5313 = _mm512_unpacklo_ps(wt101, wt102);
__m512 tmp5314 = _mm512_unpackhi_ps(wt101, wt102);
__m512 tmp5315 = _mm512_unpacklo_ps(wt103, wt104);
__m512 tmp5316 = _mm512_unpackhi_ps(wt103, wt104);
__m512 tmp5317 = _mm512_shuffle_ps(tmp5301, tmp5303, 68);
__m512 tmp5318 = _mm512_shuffle_ps(tmp5301, tmp5303, 238);
__m512 tmp5319 = _mm512_shuffle_ps(tmp5302, tmp5304, 68);
__m512 tmp5320 = _mm512_shuffle_ps(tmp5302, tmp5304, 238);
__m512 tmp5321 = _mm512_shuffle_ps(tmp5305, tmp5307, 68);
__m512 tmp5322 = _mm512_shuffle_ps(tmp5305, tmp5307, 238);
__m512 tmp5323 = _mm512_shuffle_ps(tmp5306, tmp5308, 68);
__m512 tmp5324 = _mm512_shuffle_ps(tmp5306, tmp5308, 238);
__m512 tmp5325 = _mm512_shuffle_ps(tmp5309, tmp5311, 68);
__m512 tmp5326 = _mm512_shuffle_ps(tmp5309, tmp5311, 238);
__m512 tmp5327 = _mm512_shuffle_ps(tmp5310, tmp5312, 68);
__m512 tmp5328 = _mm512_shuffle_ps(tmp5310, tmp5312, 238);
__m512 tmp5329 = _mm512_shuffle_ps(tmp5313, tmp5315, 68);
__m512 tmp5330 = _mm512_shuffle_ps(tmp5313, tmp5315, 238);
__m512 tmp5331 = _mm512_shuffle_ps(tmp5314, tmp5316, 68);
__m512 tmp5332 = _mm512_shuffle_ps(tmp5314, tmp5316, 238);
__m512 tmp5333 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 136);
__m512 tmp5334 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 221);
__m512 tmp5335 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 136);
__m512 tmp5336 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 221);
__m512 tmp5337 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 136);
__m512 tmp5338 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 221);
__m512 tmp5339 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 136);
__m512 tmp5340 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 221);
__m512 tmp5341 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 136);
__m512 tmp5342 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 221);
__m512 tmp5343 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 136);
__m512 tmp5344 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 221);
__m512 tmp5345 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 136);
__m512 tmp5346 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 221);
__m512 tmp5347 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 136);
__m512 tmp5348 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 221);
wt89 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 136);
wt97 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 221);
wt90 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 136);
wt98 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 221);
wt91 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 136);
wt99 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 221);
wt92 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 136);
wt100 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 221);
wt93 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 136);
wt101 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 221);
wt94 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 136);
wt102 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 221);
wt95 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 136);
wt103 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 221);
wt96 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 136);
wt104 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 221);
wt89 = _mm512_mul_ps(wt89, postMul13);
wt90 = _mm512_mul_ps(wt90, postMul13);
wt91 = _mm512_mul_ps(wt91, postMul13);
wt92 = _mm512_mul_ps(wt92, postMul13);
wt93 = _mm512_mul_ps(wt93, postMul13);
wt94 = _mm512_mul_ps(wt94, postMul13);
wt95 = _mm512_mul_ps(wt95, postMul13);
wt96 = _mm512_mul_ps(wt96, postMul13);
wt97 = _mm512_mul_ps(wt97, postMul13);
wt98 = _mm512_mul_ps(wt98, postMul13);
wt99 = _mm512_mul_ps(wt99, postMul13);
wt100 = _mm512_mul_ps(wt100, postMul13);
wt101 = _mm512_mul_ps(wt101, postMul13);
wt102 = _mm512_mul_ps(wt102, postMul13);
wt103 = _mm512_mul_ps(wt103, postMul13);
wt104 = _mm512_mul_ps(wt104, postMul13);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(1+16*c11)+(ptrdiff_t)0, 63>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(2+16*c11)+(ptrdiff_t)0, 63>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(3+16*c11)+(ptrdiff_t)0, 63>>cut4, wt91);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(4+16*c11)+(ptrdiff_t)0, 63>>cut4, wt92);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(5+16*c11)+(ptrdiff_t)0, 63>>cut4, wt93);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(6+16*c11)+(ptrdiff_t)0, 63>>cut4, wt94);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(7+16*c11)+(ptrdiff_t)0, 63>>cut4, wt95);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(8+16*c11)+(ptrdiff_t)0, 63>>cut4, wt96);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(9+16*c11)+(ptrdiff_t)0, 63>>cut4, wt97);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(10+16*c11)+(ptrdiff_t)0, 63>>cut4, wt98);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(11+16*c11)+(ptrdiff_t)0, 63>>cut4, wt99);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(12+16*c11)+(ptrdiff_t)0, 63>>cut4, wt100);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(13+16*c11)+(ptrdiff_t)0, 63>>cut4, wt101);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(14+16*c11)+(ptrdiff_t)0, 63>>cut4, wt102);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(15+16*c11)+(ptrdiff_t)0, 63>>cut4, wt103);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(16+16*c11)+(ptrdiff_t)0, 63>>cut4, wt104);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(1+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(2+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(3+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt91);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(4+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt92);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(5+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt93);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(6+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt94);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(7+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt95);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(8+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt96);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(9+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt97);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(10+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt98);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(11+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt99);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(12+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt100);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(13+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt101);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(14+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt102);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(15+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt103);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(16+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt104);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(1+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt89);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(2+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt90);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(3+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt91);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(4+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt92);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(5+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt93);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(6+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt94);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(7+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt95);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(8+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt96);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(9+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt97);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(10+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt98);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(11+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt99);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(12+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt100);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(13+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt101);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(14+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt102);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(15+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt103);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(16+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt104);
}
}
}
}
}

static void ResNeXt50OneArrangeWts2(ResNeXt50ThreaderTeam1* team26, char** tensors25) {
    /*
     * Submit the weight-arrangement job to the threader.
     *
     * The job is described as a 3-dimensional grid with extents 4 x 1 x 1;
     * ResNeXt50OneArrangeWts2Callee1 is the work function and tensors25 is
     * passed through untouched as its opaque payload.
     * NOTE(review): presumably the threader calls the callee once per grid
     * point -- confirm against ResNeXt50ThreaderDo1.
     */
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 4;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = tensors25;
    job.callee1 = ResNeXt50OneArrangeWts2Callee1;
    ResNeXt50ThreaderDo1(team26, &job);
}

static void ResNeXt50OneArrangeDats2Callee1(ResNeXt50ThreaderTask1* task30, int64_t* pt20) {
    /*
     * Re-lay one 256-byte-wide column of the input for the apply stage.
     *
     * task30->any1 is a char*[]: element 0 is the source buffer, element 1
     * the destination.  pt20[1] selects the tile handled by this call.
     * For each of 128 source rows (12608 bytes apart) the 64 floats at byte
     * offset 256*tile are copied to the destination, where tiles are 32768
     * bytes apart and the 128 rows of a tile sit back to back (256 bytes
     * each).  A tile index of 49 is out of range and does nothing.
     */
    enum { ROWS = 128, VECS_PER_TILE = 4 };
    char** buffers = task30->any1;
    const ptrdiff_t tile = pt20[1];
    char*restrict src = buffers[0];
    char*restrict dst = buffers[1];

    if (tile == 49) return;

    for (ptrdiff_t row = 0; row < ROWS; ++row) {
        const char* from = src+256*tile+12608*row;
        char* to = dst+32768*tile+256*row;
        __m512 chunk[VECS_PER_TILE];
        /* Read the whole 256-byte tile first, then write it out. */
        for (int v = 0; v < VECS_PER_TILE; ++v) {
            chunk[v] = _mm512_maskz_loadu_ps(65535, from+64*v);
        }
        for (int v = 0; v < VECS_PER_TILE; ++v) {
            _mm512_mask_storeu_ps(to+64*v, 65535, chunk[v]);
        }
    }
}

static void ResNeXt50OneArrangeDats2(ResNeXt50ThreaderTeam1* team27, char** tensors27) {
    /*
     * Submit the data-arrangement job to the threader.
     *
     * The grid is 4-dimensional with extents 1 x 49 x 1 x 1; the callee
     * (ResNeXt50OneArrangeDats2Callee1) reads coordinate 1, i.e. the axis of
     * extent 49.  tensors27 is forwarded as the callee's opaque payload.
     */
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[0] = 1;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    job.any1 = tensors27;
    job.callee1 = ResNeXt50OneArrangeDats2Callee1;
    ResNeXt50ThreaderDo1(team27, &job);
}

static void ResNeXt50OneApply2Callee1(ResNeXt50ThreaderTask1* task32, int64_t* pt21) {
    /*
     * One task of the apply stage.
     *
     * task32->any1 points at a pair whose first element is a char*[] with
     *   [0] arranged weights, [1] arranged data,
     *   [2] values added to the accumulated sums, [3] output.
     * pt21[1] picks the 64-float data tile (49 is out of range: no work).
     * pt21[0] picks a run of weight groups starting at group 4*pt21[0]:
     * four groups for pt21[0] < 9, otherwise everything up to and including
     * the short final group.
     *
     * Groups 0..41 each produce 6 output rows; group 42 produces 4.  Within
     * a group, every row starts from one float stored at the head of the
     * group (presumably the bias written by the arrange step -- confirm),
     * then accumulates 128 fused multiply-adds of a broadcast weight with
     * the four 16-float vectors of the data tile.  Each finished row has the
     * matching values from buffer [2] added, is clamped below at zero, and is
     * written to buffer [3].  Rows are 12608 bytes apart in both buffers.
     *
     * The row/vector loops all have compile-time trip counts so that an
     * optimizing compiler can unroll them and keep the accumulators in
     * registers.
     */
    enum { VECS = 4, DEPTH = 128, FULL_ROWS = 6, TAIL_ROWS = 4, FULL_GROUPS = 42 };
    void** pair = task32->any1;
    char** buffers = pair[0];
    const ptrdiff_t tile = pt21[1];
    const ptrdiff_t part = pt21[0];
    char*restrict wts = buffers[0];
    char*restrict dats = buffers[1];
    char*restrict addend = buffers[2];
    char*restrict result = buffers[3];

    if (tile == 49) return;

    const char* tileDats = dats+32768*tile;
    ptrdiff_t grp = 4*part;
    const ptrdiff_t lastGrp = grp+(part < 9 ? 3 : 6);

    /* Full groups: 6 rows each, weights interleaved 6 floats (24 bytes) per step. */
    for (; grp != FULL_GROUPS; ++grp) {
        const char* grpWts = wts+3096*grp;
        const ptrdiff_t grpOff = 256*tile+75648*grp;
        __m512 acc[FULL_ROWS][VECS];
        for (int r = 0; r < FULL_ROWS; ++r) {
            const __m512 seed = _mm512_set1_ps(*(const float*)(grpWts+4*r));
            for (int v = 0; v < VECS; ++v) acc[r][v] = seed;
        }
        for (ptrdiff_t s = 0; s < DEPTH; ++s) {
            __m512 x[VECS];
            for (int v = 0; v < VECS; ++v) {
                x[v] = _mm512_loadu_ps(tileDats+256*s+64*v);
            }
            for (int r = 0; r < FULL_ROWS; ++r) {
                /* Slot 0 of the group holds the seeds, hence the s+1. */
                const __m512 w = _mm512_set1_ps(*(const float*)(grpWts+24*(s+1)+4*r));
                for (int v = 0; v < VECS; ++v) {
                    acc[r][v] = _mm512_fmadd_ps(w, x[v], acc[r][v]);
                }
            }
        }
        for (int r = 0; r < FULL_ROWS; ++r) {
            const ptrdiff_t rowOff = grpOff+12608*r;
            for (int v = 0; v < VECS; ++v) {
                acc[r][v] = _mm512_add_ps(acc[r][v], _mm512_maskz_loadu_ps(65535, addend+rowOff+64*v));
            }
            for (int v = 0; v < VECS; ++v) {
                acc[r][v] = _mm512_max_ps(_mm512_setzero_ps(), acc[r][v]);
            }
            for (int v = 0; v < VECS; ++v) {
                _mm512_mask_storeu_ps(result+rowOff+64*v, 65535, acc[r][v]);
            }
        }
        if (grp >= lastGrp) return;
    }

    /* Final short group (reached only when the loop above ran up to group 42):
       4 rows, weights interleaved 4 floats (16 bytes) per step. */
    {
        const char* grpWts = wts+3096*grp;
        const ptrdiff_t grpOff = 256*tile+75648*grp;
        __m512 acc[TAIL_ROWS][VECS];
        for (int r = 0; r < TAIL_ROWS; ++r) {
            const __m512 seed = _mm512_set1_ps(*(const float*)(grpWts+4*r));
            for (int v = 0; v < VECS; ++v) acc[r][v] = seed;
        }
        for (ptrdiff_t s = 0; s < DEPTH; ++s) {
            __m512 x[VECS];
            for (int v = 0; v < VECS; ++v) {
                x[v] = _mm512_loadu_ps(tileDats+256*s+64*v);
            }
            for (int r = 0; r < TAIL_ROWS; ++r) {
                const __m512 w = _mm512_set1_ps(*(const float*)(grpWts+16*(s+1)+4*r));
                for (int v = 0; v < VECS; ++v) {
                    acc[r][v] = _mm512_fmadd_ps(w, x[v], acc[r][v]);
                }
            }
        }
        for (int r = 0; r < TAIL_ROWS; ++r) {
            const ptrdiff_t rowOff = grpOff+12608*r;
            for (int v = 0; v < VECS; ++v) {
                acc[r][v] = _mm512_add_ps(acc[r][v], _mm512_maskz_loadu_ps(65535, addend+rowOff+64*v));
            }
            for (int v = 0; v < VECS; ++v) {
                acc[r][v] = _mm512_max_ps(_mm512_setzero_ps(), acc[r][v]);
            }
            for (int v = 0; v < VECS; ++v) {
                _mm512_mask_storeu_ps(result+rowOff+64*v, 65535, acc[r][v]);
            }
        }
    }
}

static void ResNeXt50OneApply2(ResNeXt50ThreaderTeam1* team28, char** tensors29) {
    /*
     * Submit the apply job to the threader.
     *
     * The callee receives a two-slot pointer pair: slot 0 is the tensor
     * table, slot 1 is null.  The grid is 3-dimensional with extents
     * 10 x 49 x 1; ResNeXt50OneApply2Callee1 reads coordinate 0 (extent 10)
     * and coordinate 1 (extent 49).
     */
    void* payload[] = {tensors29, 0};
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 10;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.any1 = payload;
    job.callee1 = ResNeXt50OneApply2Callee1;
    ResNeXt50ThreaderDo1(team28, &job);
}

static void ResNeXt50OneArrangeWts3Callee1(ResNeXt50ThreaderTask1* task34, int64_t* pt22) {
char** tensors32 = task34->any1;
ptrdiff_t b49 = pt22[0];
char*restrict wtPtr6 = tensors32[0]+(ptrdiff_t)3340*0+(ptrdiff_t)131072*0;
char*restrict biasPtr6 = tensors32[1]+(ptrdiff_t)512*0;
char*restrict bnPtr7 = tensors32[2]+(ptrdiff_t)8*128*0;
char*restrict arranged5 = tensors32[3]+(ptrdiff_t)428032*0+(ptrdiff_t)131584*0;
ptrdiff_t ii11 = 1;
for (ptrdiff_t i23 = 0; i23 < ii11; ++i23) {
ptrdiff_t j17 = 2*b49;
ptrdiff_t jj26 = j17+2;
for (; j17 < jj26; ++j17) {
if (j17 < 7) {
ptrdiff_t k75 = 0+16*(j17-0);
ptrdiff_t l25 = (size_t)(0+k75)/6;
ptrdiff_t cut7 = (size_t)(0+k75)%6;
switch (cut7) {
case 0:;
case 2: {
__m512 sum114 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i23+4*k75);
__m512i pmMul10 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd10 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo8 = _mm512_loadu_ps(bnPtr7+(ptrdiff_t)8*(k75+128*i23));
__m512 masHi8 = _mm512_maskz_loadu_ps(65535, bnPtr7+(ptrdiff_t)8*(k75+128*i23)+(ptrdiff_t)64);
__m512 postMul17 = _mm512_permutex2var_ps(masLo8, pmMul10, masHi8);
__m512 postAdd11 = _mm512_permutex2var_ps(masLo8, pmAdd10, masHi8);
sum114 = _mm512_fmadd_ps(sum114, postMul17, postAdd11);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)6144, 4032>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)12288, 65535-(4095>>cut7), sum114);
ptrdiff_t c16 = 0;
for (; c16 != 16; ++c16) {
__m512 wt163 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)0);
__m512 wt164 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)1024);
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)2048);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)3072);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)4096);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)5120);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)6144);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)7168);
__m512 wt171 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)8192);
__m512 wt172 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)9216);
__m512 wt173 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)10240);
__m512 wt174 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)11264);
__m512 wt175 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)12288);
__m512 wt176 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)13312);
__m512 wt177 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)14336);
__m512 wt178 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)15360);
__m512 tmp5349 = _mm512_unpacklo_ps(wt163, wt164);
__m512 tmp5350 = _mm512_unpackhi_ps(wt163, wt164);
__m512 tmp5351 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp5352 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp5353 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp5354 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp5355 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp5356 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp5357 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp5358 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp5359 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp5360 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp5361 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp5362 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp5363 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp5364 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp5365 = _mm512_shuffle_ps(tmp5349, tmp5351, 68);
__m512 tmp5366 = _mm512_shuffle_ps(tmp5349, tmp5351, 238);
__m512 tmp5367 = _mm512_shuffle_ps(tmp5350, tmp5352, 68);
__m512 tmp5368 = _mm512_shuffle_ps(tmp5350, tmp5352, 238);
__m512 tmp5369 = _mm512_shuffle_ps(tmp5353, tmp5355, 68);
__m512 tmp5370 = _mm512_shuffle_ps(tmp5353, tmp5355, 238);
__m512 tmp5371 = _mm512_shuffle_ps(tmp5354, tmp5356, 68);
__m512 tmp5372 = _mm512_shuffle_ps(tmp5354, tmp5356, 238);
__m512 tmp5373 = _mm512_shuffle_ps(tmp5357, tmp5359, 68);
__m512 tmp5374 = _mm512_shuffle_ps(tmp5357, tmp5359, 238);
__m512 tmp5375 = _mm512_shuffle_ps(tmp5358, tmp5360, 68);
__m512 tmp5376 = _mm512_shuffle_ps(tmp5358, tmp5360, 238);
__m512 tmp5377 = _mm512_shuffle_ps(tmp5361, tmp5363, 68);
__m512 tmp5378 = _mm512_shuffle_ps(tmp5361, tmp5363, 238);
__m512 tmp5379 = _mm512_shuffle_ps(tmp5362, tmp5364, 68);
__m512 tmp5380 = _mm512_shuffle_ps(tmp5362, tmp5364, 238);
__m512 tmp5381 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 136);
__m512 tmp5382 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 221);
__m512 tmp5383 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 136);
__m512 tmp5384 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 221);
__m512 tmp5385 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 136);
__m512 tmp5386 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 221);
__m512 tmp5387 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 136);
__m512 tmp5388 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 221);
__m512 tmp5389 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 136);
__m512 tmp5390 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 221);
__m512 tmp5391 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 136);
__m512 tmp5392 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 221);
__m512 tmp5393 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 136);
__m512 tmp5394 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 221);
__m512 tmp5395 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 136);
__m512 tmp5396 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 221);
wt163 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 136);
wt171 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 221);
wt164 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 136);
wt172 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 221);
wt165 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 136);
wt173 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 221);
wt166 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 136);
wt174 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 221);
wt167 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 136);
wt175 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 221);
wt168 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 136);
wt176 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 221);
wt169 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 136);
wt177 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 221);
wt170 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 136);
wt178 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 221);
wt163 = _mm512_mul_ps(wt163, postMul17);
wt164 = _mm512_mul_ps(wt164, postMul17);
wt165 = _mm512_mul_ps(wt165, postMul17);
wt166 = _mm512_mul_ps(wt166, postMul17);
wt167 = _mm512_mul_ps(wt167, postMul17);
wt168 = _mm512_mul_ps(wt168, postMul17);
wt169 = _mm512_mul_ps(wt169, postMul17);
wt170 = _mm512_mul_ps(wt170, postMul17);
wt171 = _mm512_mul_ps(wt171, postMul17);
wt172 = _mm512_mul_ps(wt172, postMul17);
wt173 = _mm512_mul_ps(wt173, postMul17);
wt174 = _mm512_mul_ps(wt174, postMul17);
wt175 = _mm512_mul_ps(wt175, postMul17);
wt176 = _mm512_mul_ps(wt176, postMul17);
wt177 = _mm512_mul_ps(wt177, postMul17);
wt178 = _mm512_mul_ps(wt178, postMul17);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)0, 63>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)0, 63>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)0, 63>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)0, 63>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)0, 63>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)0, 63>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)0, 63>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)0, 63>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)0, 63>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)0, 63>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)0, 63>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)0, 63>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)0, 63>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)0, 63>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)0, 63>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)0, 63>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt163);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt164);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt165);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt166);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt167);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt168);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt169);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt170);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt171);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt172);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt173);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt174);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt175);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt176);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt177);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt178);
}
break;
}
default: {
cut7 = 4;
__m512 sum115 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i23+4*k75);
__m512i pmMul11 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd11 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo9 = _mm512_loadu_ps(bnPtr7+(ptrdiff_t)8*(k75+128*i23));
__m512 masHi9 = _mm512_maskz_loadu_ps(65535, bnPtr7+(ptrdiff_t)8*(k75+128*i23)+(ptrdiff_t)64);
__m512 postMul18 = _mm512_permutex2var_ps(masLo9, pmMul11, masHi9);
__m512 postAdd12 = _mm512_permutex2var_ps(masLo9, pmAdd11, masHi9);
sum115 = _mm512_fmadd_ps(sum115, postMul18, postAdd12);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum115);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)6144, 4032>>cut7, sum115);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)12288, 258048>>cut7, sum115);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)18432, 65535-(262143>>cut7), sum115);
ptrdiff_t c17 = 0;
for (; c17 != 16; ++c17) {
__m512 wt179 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)0);
__m512 wt180 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)1024);
__m512 wt181 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)2048);
__m512 wt182 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)3072);
__m512 wt183 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)4096);
__m512 wt184 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)5120);
__m512 wt185 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)6144);
__m512 wt186 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)7168);
__m512 wt187 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)8192);
__m512 wt188 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)9216);
__m512 wt189 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)10240);
__m512 wt190 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)11264);
__m512 wt191 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)12288);
__m512 wt192 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)13312);
__m512 wt193 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)14336);
__m512 wt194 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)15360);
__m512 tmp5397 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp5398 = _mm512_unpackhi_ps(wt179, wt180);
__m512 tmp5399 = _mm512_unpacklo_ps(wt181, wt182);
__m512 tmp5400 = _mm512_unpackhi_ps(wt181, wt182);
__m512 tmp5401 = _mm512_unpacklo_ps(wt183, wt184);
__m512 tmp5402 = _mm512_unpackhi_ps(wt183, wt184);
__m512 tmp5403 = _mm512_unpacklo_ps(wt185, wt186);
__m512 tmp5404 = _mm512_unpackhi_ps(wt185, wt186);
__m512 tmp5405 = _mm512_unpacklo_ps(wt187, wt188);
__m512 tmp5406 = _mm512_unpackhi_ps(wt187, wt188);
__m512 tmp5407 = _mm512_unpacklo_ps(wt189, wt190);
__m512 tmp5408 = _mm512_unpackhi_ps(wt189, wt190);
__m512 tmp5409 = _mm512_unpacklo_ps(wt191, wt192);
__m512 tmp5410 = _mm512_unpackhi_ps(wt191, wt192);
__m512 tmp5411 = _mm512_unpacklo_ps(wt193, wt194);
__m512 tmp5412 = _mm512_unpackhi_ps(wt193, wt194);
__m512 tmp5413 = _mm512_shuffle_ps(tmp5397, tmp5399, 68);
__m512 tmp5414 = _mm512_shuffle_ps(tmp5397, tmp5399, 238);
__m512 tmp5415 = _mm512_shuffle_ps(tmp5398, tmp5400, 68);
__m512 tmp5416 = _mm512_shuffle_ps(tmp5398, tmp5400, 238);
__m512 tmp5417 = _mm512_shuffle_ps(tmp5401, tmp5403, 68);
__m512 tmp5418 = _mm512_shuffle_ps(tmp5401, tmp5403, 238);
__m512 tmp5419 = _mm512_shuffle_ps(tmp5402, tmp5404, 68);
__m512 tmp5420 = _mm512_shuffle_ps(tmp5402, tmp5404, 238);
__m512 tmp5421 = _mm512_shuffle_ps(tmp5405, tmp5407, 68);
__m512 tmp5422 = _mm512_shuffle_ps(tmp5405, tmp5407, 238);
__m512 tmp5423 = _mm512_shuffle_ps(tmp5406, tmp5408, 68);
__m512 tmp5424 = _mm512_shuffle_ps(tmp5406, tmp5408, 238);
__m512 tmp5425 = _mm512_shuffle_ps(tmp5409, tmp5411, 68);
__m512 tmp5426 = _mm512_shuffle_ps(tmp5409, tmp5411, 238);
__m512 tmp5427 = _mm512_shuffle_ps(tmp5410, tmp5412, 68);
__m512 tmp5428 = _mm512_shuffle_ps(tmp5410, tmp5412, 238);
__m512 tmp5429 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 136);
__m512 tmp5430 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 221);
__m512 tmp5431 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 136);
__m512 tmp5432 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 221);
__m512 tmp5433 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 136);
__m512 tmp5434 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 221);
__m512 tmp5435 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 136);
__m512 tmp5436 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 221);
__m512 tmp5437 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 136);
__m512 tmp5438 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 221);
__m512 tmp5439 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 136);
__m512 tmp5440 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 221);
__m512 tmp5441 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 136);
__m512 tmp5442 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 221);
__m512 tmp5443 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 136);
__m512 tmp5444 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 221);
wt179 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 136);
wt187 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 221);
wt180 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 136);
wt188 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 221);
wt181 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 136);
wt189 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 221);
wt182 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 136);
wt190 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 221);
wt183 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 136);
wt191 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 221);
wt184 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 136);
wt192 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 221);
wt185 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 136);
wt193 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 221);
wt186 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 136);
wt194 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 221);
wt179 = _mm512_mul_ps(wt179, postMul18);
wt180 = _mm512_mul_ps(wt180, postMul18);
wt181 = _mm512_mul_ps(wt181, postMul18);
wt182 = _mm512_mul_ps(wt182, postMul18);
wt183 = _mm512_mul_ps(wt183, postMul18);
wt184 = _mm512_mul_ps(wt184, postMul18);
wt185 = _mm512_mul_ps(wt185, postMul18);
wt186 = _mm512_mul_ps(wt186, postMul18);
wt187 = _mm512_mul_ps(wt187, postMul18);
wt188 = _mm512_mul_ps(wt188, postMul18);
wt189 = _mm512_mul_ps(wt189, postMul18);
wt190 = _mm512_mul_ps(wt190, postMul18);
wt191 = _mm512_mul_ps(wt191, postMul18);
wt192 = _mm512_mul_ps(wt192, postMul18);
wt193 = _mm512_mul_ps(wt193, postMul18);
wt194 = _mm512_mul_ps(wt194, postMul18);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)0, 63>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)0, 63>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)0, 63>>cut7, wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)0, 63>>cut7, wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)0, 63>>cut7, wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)0, 63>>cut7, wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)0, 63>>cut7, wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)0, 63>>cut7, wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)0, 63>>cut7, wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)0, 63>>cut7, wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)0, 63>>cut7, wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)0, 63>>cut7, wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)0, 63>>cut7, wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)0, 63>>cut7, wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)0, 63>>cut7, wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)0, 63>>cut7, wt194);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt194);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt194);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt194);
}
}
}
} else {
ptrdiff_t k74 = 112;
ptrdiff_t l24 = (size_t)(0+k74)/6;
ptrdiff_t cut6 = (size_t)(0+k74)%6;
__m512 sum113 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i23+4*k74);
__m512i pmMul12 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd12 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo10 = _mm512_loadu_ps(bnPtr7+(ptrdiff_t)8*(k74+128*i23));
__m512 masHi10 = _mm512_maskz_loadu_ps(65535, bnPtr7+(ptrdiff_t)8*(k74+128*i23)+(ptrdiff_t)64);
__m512 postMul16 = _mm512_permutex2var_ps(masLo10, pmMul12, masHi10);
__m512 postAdd10 = _mm512_permutex2var_ps(masLo10, pmAdd12, masHi10);
sum113 = _mm512_fmadd_ps(sum113, postMul16, postAdd10);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum113);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*0+(ptrdiff_t)6144, 4032>>cut6, sum113);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*0+(ptrdiff_t)12288, 258048>>cut6, sum113);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*0+(ptrdiff_t)18432, 65535-(262143>>cut6), sum113);
ptrdiff_t c15 = 0;
for (; c15 != 16; ++c15) {
__m512 wt147 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)0);
__m512 wt148 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)1024);
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)2048);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)3072);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)4096);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)5120);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)6144);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)7168);
__m512 wt155 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)8192);
__m512 wt156 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)9216);
__m512 wt157 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)10240);
__m512 wt158 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)11264);
__m512 wt159 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)12288);
__m512 wt160 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)13312);
__m512 wt161 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)14336);
__m512 wt162 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)15360);
__m512 tmp5445 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp5446 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp5447 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp5448 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp5449 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp5450 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp5451 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp5452 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp5453 = _mm512_unpacklo_ps(wt155, wt156);
__m512 tmp5454 = _mm512_unpackhi_ps(wt155, wt156);
__m512 tmp5455 = _mm512_unpacklo_ps(wt157, wt158);
__m512 tmp5456 = _mm512_unpackhi_ps(wt157, wt158);
__m512 tmp5457 = _mm512_unpacklo_ps(wt159, wt160);
__m512 tmp5458 = _mm512_unpackhi_ps(wt159, wt160);
__m512 tmp5459 = _mm512_unpacklo_ps(wt161, wt162);
__m512 tmp5460 = _mm512_unpackhi_ps(wt161, wt162);
__m512 tmp5461 = _mm512_shuffle_ps(tmp5445, tmp5447, 68);
__m512 tmp5462 = _mm512_shuffle_ps(tmp5445, tmp5447, 238);
__m512 tmp5463 = _mm512_shuffle_ps(tmp5446, tmp5448, 68);
__m512 tmp5464 = _mm512_shuffle_ps(tmp5446, tmp5448, 238);
__m512 tmp5465 = _mm512_shuffle_ps(tmp5449, tmp5451, 68);
__m512 tmp5466 = _mm512_shuffle_ps(tmp5449, tmp5451, 238);
__m512 tmp5467 = _mm512_shuffle_ps(tmp5450, tmp5452, 68);
__m512 tmp5468 = _mm512_shuffle_ps(tmp5450, tmp5452, 238);
__m512 tmp5469 = _mm512_shuffle_ps(tmp5453, tmp5455, 68);
__m512 tmp5470 = _mm512_shuffle_ps(tmp5453, tmp5455, 238);
__m512 tmp5471 = _mm512_shuffle_ps(tmp5454, tmp5456, 68);
__m512 tmp5472 = _mm512_shuffle_ps(tmp5454, tmp5456, 238);
__m512 tmp5473 = _mm512_shuffle_ps(tmp5457, tmp5459, 68);
__m512 tmp5474 = _mm512_shuffle_ps(tmp5457, tmp5459, 238);
__m512 tmp5475 = _mm512_shuffle_ps(tmp5458, tmp5460, 68);
__m512 tmp5476 = _mm512_shuffle_ps(tmp5458, tmp5460, 238);
__m512 tmp5477 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 136);
__m512 tmp5478 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 221);
__m512 tmp5479 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 136);
__m512 tmp5480 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 221);
__m512 tmp5481 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 136);
__m512 tmp5482 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 221);
__m512 tmp5483 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 136);
__m512 tmp5484 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 221);
__m512 tmp5485 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 136);
__m512 tmp5486 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 221);
__m512 tmp5487 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 136);
__m512 tmp5488 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 221);
__m512 tmp5489 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 136);
__m512 tmp5490 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 221);
__m512 tmp5491 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 136);
__m512 tmp5492 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 221);
wt147 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 136);
wt155 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 221);
wt148 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 136);
wt156 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 221);
wt149 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 136);
wt157 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 221);
wt150 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 136);
wt158 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 221);
wt151 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 136);
wt159 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 221);
wt152 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 136);
wt160 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 221);
wt153 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 136);
wt161 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 221);
wt154 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 136);
wt162 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 221);
wt147 = _mm512_mul_ps(wt147, postMul16);
wt148 = _mm512_mul_ps(wt148, postMul16);
wt149 = _mm512_mul_ps(wt149, postMul16);
wt150 = _mm512_mul_ps(wt150, postMul16);
wt151 = _mm512_mul_ps(wt151, postMul16);
wt152 = _mm512_mul_ps(wt152, postMul16);
wt153 = _mm512_mul_ps(wt153, postMul16);
wt154 = _mm512_mul_ps(wt154, postMul16);
wt155 = _mm512_mul_ps(wt155, postMul16);
wt156 = _mm512_mul_ps(wt156, postMul16);
wt157 = _mm512_mul_ps(wt157, postMul16);
wt158 = _mm512_mul_ps(wt158, postMul16);
wt159 = _mm512_mul_ps(wt159, postMul16);
wt160 = _mm512_mul_ps(wt160, postMul16);
wt161 = _mm512_mul_ps(wt161, postMul16);
wt162 = _mm512_mul_ps(wt162, postMul16);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(1+16*c15)+(ptrdiff_t)0, 63>>cut6, wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(2+16*c15)+(ptrdiff_t)0, 63>>cut6, wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(3+16*c15)+(ptrdiff_t)0, 63>>cut6, wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(4+16*c15)+(ptrdiff_t)0, 63>>cut6, wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(5+16*c15)+(ptrdiff_t)0, 63>>cut6, wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(6+16*c15)+(ptrdiff_t)0, 63>>cut6, wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(7+16*c15)+(ptrdiff_t)0, 63>>cut6, wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(8+16*c15)+(ptrdiff_t)0, 63>>cut6, wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(9+16*c15)+(ptrdiff_t)0, 63>>cut6, wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(10+16*c15)+(ptrdiff_t)0, 63>>cut6, wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(11+16*c15)+(ptrdiff_t)0, 63>>cut6, wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(12+16*c15)+(ptrdiff_t)0, 63>>cut6, wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(13+16*c15)+(ptrdiff_t)0, 63>>cut6, wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(14+16*c15)+(ptrdiff_t)0, 63>>cut6, wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(15+16*c15)+(ptrdiff_t)0, 63>>cut6, wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(16+16*c15)+(ptrdiff_t)0, 63>>cut6, wt162);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(1+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(2+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(3+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(4+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(5+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(6+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(7+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(8+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(9+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(10+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(11+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(12+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(13+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(14+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(15+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(16+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt162);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(1+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(2+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(3+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(4+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(5+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(6+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(7+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(8+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(9+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(10+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(11+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(12+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(13+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(14+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(15+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(16+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt162);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(1+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(2+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(3+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(4+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(5+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(6+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(7+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(8+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(9+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(10+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(11+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(12+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(13+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(14+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(15+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(16+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt162);
}
}
}
}
}

// Builds a threader task that runs ResNeXt50OneArrangeWts3Callee1 and submits
// it to team29 through ResNeXt50ThreaderDo1.
//
// team29:    threader team the task is submitted to.
// tensors31: pointer array stored in the task's any1 field for the callee.
//
// NOTE(review): nd1/hull1 are presumably the dimension count and per-dimension
// extents of the task's iteration space (here 4 x 1 x 1) -- confirm against the
// threader implementation.
static void ResNeXt50OneArrangeWts3(ResNeXt50ThreaderTeam1* team29, char** tensors31) {
    ResNeXt50ThreaderTask1 job;

    // Three entries of hull1 are filled in: 4, 1, 1.
    job.nd1 = 3;
    job.hull1[0] = 4;
    job.hull1[1] = 1;
    job.hull1[2] = 1;

    // Payload and entry point for the callee.
    job.any1 = tensors31;
    job.callee1 = ResNeXt50OneArrangeWts3Callee1;

    ResNeXt50ThreaderDo1(team29, &job);
}

// Copies one slab of floats from the source buffer into the "arranged" buffer,
// changing the byte layout on the way.
//
// task36->any1 is read as an array of char*: entry 0 is the source and entry 1
// is the destination.
// pt23[0] picks a run of 128 consecutive k values (128*pt23[0] .. 128*pt23[0]+127).
// pt23[1] picks the 256-byte column j.
//
// For each k, 64 floats (four 16-float vectors) are moved
//   from  source      + 256*j   + 12608*k
//   to    destination + 65536*j + 256*k
// Nothing is copied when pt23[1] == 49.
//
// NOTE(review): pt23 is presumably the task coordinate handed in by the
// threader, bounded by the hull set up in ResNeXt50OneArrangeDats3 -- confirm.
static void ResNeXt50OneArrangeDats3Callee1(ResNeXt50ThreaderTask1* task36, int64_t* pt23) {
    char** bufs = task36->any1;
    const ptrdiff_t half = pt23[0];
    const ptrdiff_t col = pt23[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    // The generated column loop only ever ran for the starting column, and not
    // at all when that column equals the loop bound of 49.
    if (col == 49) return;

    const ptrdiff_t kStop = 128*half+128;
    for (ptrdiff_t k = 128*half; k < kStop; ++k) {
        const char* from = src+256*col+12608*k;
        char* to = dst+65536*col+256*k;
        __m512 lanes[4];

        // All four loads are issued before the first store, as in the
        // generated code.
        for (ptrdiff_t n = 0; n < 4; ++n) {
            lanes[n] = _mm512_maskz_loadu_ps(65535, from+64*n);
        }
        for (ptrdiff_t n = 0; n < 4; ++n) {
            _mm512_mask_storeu_ps(to+64*n, 65535, lanes[n]);
        }
    }
}

// Builds a threader task that runs ResNeXt50OneArrangeDats3Callee1 and submits
// it to team30 through ResNeXt50ThreaderDo1.
//
// team30:    threader team the task is submitted to.
// tensors33: pointer array stored in the task's any1 field; the callee reads
//            entries 0 and 1 from it.
//
// NOTE(review): nd1/hull1 are presumably the dimension count and per-dimension
// extents of the task's iteration space (here 2 x 49 x 1 x 1) -- confirm
// against the threader implementation.
static void ResNeXt50OneArrangeDats3(ResNeXt50ThreaderTeam1* team30, char** tensors33) {
    ResNeXt50ThreaderTask1 job;

    // Four entries of hull1 are filled in: 2, 49, 1, 1.
    job.nd1 = 4;
    job.hull1[0] = 2;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.hull1[3] = 1;

    // Payload and entry point for the callee.
    job.any1 = tensors33;
    job.callee1 = ResNeXt50OneArrangeDats3Callee1;

    ResNeXt50ThreaderDo1(team30, &job);
}

// Helper for ResNeXt50OneApply3Callee1: produces `rows` output rows (rows <= 6)
// of 64 floats each for one column tile.
//
// wts:  start of the weight group. It begins with `rows` floats used as the
//       initial accumulator values, followed by 256 steps of `rows` floats each
//       (step s, row r lives at byte offset 4*rows*(s+1) + 4*r).
// dats: start of the arranged data for the tile; step s is 256 bytes
//       (four 16-float vectors) at byte offset 256*s.
// out:  destination of row 0; consecutive rows are 12608 bytes apart.
//
// Each accumulator sees exactly the same sequence of fused multiply-adds as in
// the fully unrolled generated code, then is passed through max(0, x) and
// stored.
//
// NOTE(review): presumably the leading floats are biases folded in by the
// weight-arranging step -- confirm against that step.
// NOTE(review): the fixed-count loops rely on the compiler unrolling them and
// keeping acc[][] in registers; check the generated code if throughput matters.
static inline void ResNeXt50OneApply3RowBlock(const char*restrict wts, const char*restrict dats, char*restrict out, ptrdiff_t rows) {
    __m512 acc[6][4];

    // Every accumulator of a row starts from that row's leading float.
    for (ptrdiff_t r = 0; r < rows; ++r) {
        const __m512 seed = _mm512_set1_ps(*(const float*)(wts+4*r));
        for (ptrdiff_t c = 0; c < 4; ++c) {
            acc[r][c] = seed;
        }
    }

    for (ptrdiff_t s = 0; s < 256; ++s) {
        const char* stepDats = dats+256*s;
        const char* stepWts = wts+4*rows*(s+1);
        __m512 x[4];
        for (ptrdiff_t c = 0; c < 4; ++c) {
            x[c] = _mm512_loadu_ps(stepDats+64*c);
        }
        for (ptrdiff_t r = 0; r < rows; ++r) {
            const __m512 w = _mm512_set1_ps(*(const float*)(stepWts+4*r));
            for (ptrdiff_t c = 0; c < 4; ++c) {
                acc[r][c] = _mm512_fmadd_ps(w, x[c], acc[r][c]);
            }
        }
    }

    for (ptrdiff_t r = 0; r < rows; ++r) {
        for (ptrdiff_t c = 0; c < 4; ++c) {
            // Zero stays the first operand, exactly as in the generated code.
            const __m512 clamped = _mm512_max_ps(_mm512_setzero_ps(), acc[r][c]);
            _mm512_mask_storeu_ps(out+12608*r+64*c, 65535, clamped);
        }
    }
}

// Threader callee: computes output rows for one column tile from the arranged
// weights and arranged data.
//
// task38->any1 is read as a void* array whose entry 0 is a char* array:
//   [0] arranged weights, [1] arranged data, [2] output.
// pt24[0] selects the pair of 6-row weight groups 2*pt24[0] and 2*pt24[0]+1
// (groups are 6168 bytes apart in the weights and 75648 bytes apart in the
// output).
// pt24[1] selects the column tile (65536 bytes apart in the data, 256 bytes
// apart in the output).
//
// If the group index reaches 21 before the pair is finished, the loop stops and
// group 21 is processed as a 2-row group instead. Nothing is done when
// pt24[1] == 49.
//
// NOTE(review): pt24 is presumably the task coordinate handed in by the
// threader, bounded by the hull set up in ResNeXt50OneApply3 -- confirm.
static void ResNeXt50OneApply3Callee1(ResNeXt50ThreaderTask1* task38, int64_t* pt24) {
    void** pair = task38->any1;
    char** bufs = pair[0];
    const ptrdiff_t tile = pt24[1];
    ptrdiff_t group = 2*pt24[0];
    char*restrict wts = bufs[0];
    char*restrict dats = bufs[1];
    char*restrict out = bufs[2];

    // The generated tile loop only ever ran for the starting tile, and not at
    // all when that tile equals the loop bound of 49.
    if (tile == 49) return;

    const char* tileDats = dats+65536*tile;
    char* tileOut = out+256*tile;
    const ptrdiff_t lastGroup = group+1;

    // Full 6-row groups; leave as soon as the second group of the pair is done.
    for (; group != 21; ++group) {
        ResNeXt50OneApply3RowBlock(wts+6168*group, tileDats, tileOut+75648*group, 6);
        if (group >= lastGroup) return;
    }

    // Reached only with group == 21: the trailing 2-row group.
    ResNeXt50OneApply3RowBlock(wts+6168*group, tileDats, tileOut+75648*group, 2);
}

// Builds a threader task that runs ResNeXt50OneApply3Callee1 and submits it to
// team31 through ResNeXt50ThreaderDo1.
//
// team31:    threader team the task is submitted to.
// tensors35: pointer array; it is wrapped as element 0 of a two-element void*
//            array (element 1 is null) and that array is stored in the task's
//            any1 field. The callee reads element 0 back.
//
// NOTE(review): nd1/hull1 are presumably the dimension count and per-dimension
// extents of the task's iteration space (here 11 x 49 x 1) -- confirm against
// the threader implementation.
static void ResNeXt50OneApply3(ResNeXt50ThreaderTeam1* team31, char** tensors35) {
    ResNeXt50ThreaderTask1 job;
    void* args[] = {tensors35, 0};

    // Three entries of hull1 are filled in: 11, 49, 1.
    job.nd1 = 3;
    job.hull1[0] = 11;
    job.hull1[1] = 49;
    job.hull1[2] = 1;

    // Payload and entry point for the callee.
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply3Callee1;

    ResNeXt50ThreaderDo1(team31, &job);
}

static void ResNeXt50OneArrangeWts4Callee1(ResNeXt50ThreaderTask1* task48, int64_t* pt29) {
char** tensors46 = task48->any1;
ptrdiff_t b53 = pt29[0];
char*restrict wtPtr8 = tensors46[0]+(ptrdiff_t)3340*0+(ptrdiff_t)524288*0;
char*restrict biasPtr8 = tensors46[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr9 = tensors46[2]+(ptrdiff_t)8*512*0;
char*restrict arranged7 = tensors46[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)526336*0;
ptrdiff_t ii18 = 1;
for (ptrdiff_t i31 = 0; i31 < ii18; ++i31) {
ptrdiff_t j24 = 2*b53;
ptrdiff_t jj30 = j24+2;
for (; j24 < jj30; ++j24) {
if (j24 < 31) {
ptrdiff_t k99 = 0+16*(j24-0);
ptrdiff_t l38 = (size_t)(0+k99)/6;
ptrdiff_t cut10 = (size_t)(0+k99)%6;
switch (cut10) {
case 0:;
case 2: {
__m512 sum189 = _mm512_maskz_loadu_ps(65535, biasPtr8+2048*i31+4*k99);
__m512i pmMul14 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd14 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo11 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k99+512*i31));
__m512 masHi11 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k99+512*i31)+(ptrdiff_t)64);
__m512 postMul25 = _mm512_permutex2var_ps(masLo11, pmMul14, masHi11);
__m512 postAdd15 = _mm512_permutex2var_ps(masLo11, pmAdd14, masHi11);
sum189 = _mm512_fmadd_ps(sum189, postMul25, postAdd15);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum189);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum189);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 65535-(4095>>cut10), sum189);
ptrdiff_t c21 = 0;
for (; c21 != 16; ++c21) {
__m512 wt223 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)0);
__m512 wt224 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)1024);
__m512 wt225 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)2048);
__m512 wt226 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)3072);
__m512 wt227 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)4096);
__m512 wt228 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)5120);
__m512 wt229 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)6144);
__m512 wt230 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)7168);
__m512 wt231 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)8192);
__m512 wt232 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)9216);
__m512 wt233 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)10240);
__m512 wt234 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)11264);
__m512 wt235 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)12288);
__m512 wt236 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)13312);
__m512 wt237 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)14336);
__m512 wt238 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)15360);
__m512 tmp10505 = _mm512_unpacklo_ps(wt223, wt224);
__m512 tmp10506 = _mm512_unpackhi_ps(wt223, wt224);
__m512 tmp10507 = _mm512_unpacklo_ps(wt225, wt226);
__m512 tmp10508 = _mm512_unpackhi_ps(wt225, wt226);
__m512 tmp10509 = _mm512_unpacklo_ps(wt227, wt228);
__m512 tmp10510 = _mm512_unpackhi_ps(wt227, wt228);
__m512 tmp10511 = _mm512_unpacklo_ps(wt229, wt230);
__m512 tmp10512 = _mm512_unpackhi_ps(wt229, wt230);
__m512 tmp10513 = _mm512_unpacklo_ps(wt231, wt232);
__m512 tmp10514 = _mm512_unpackhi_ps(wt231, wt232);
__m512 tmp10515 = _mm512_unpacklo_ps(wt233, wt234);
__m512 tmp10516 = _mm512_unpackhi_ps(wt233, wt234);
__m512 tmp10517 = _mm512_unpacklo_ps(wt235, wt236);
__m512 tmp10518 = _mm512_unpackhi_ps(wt235, wt236);
__m512 tmp10519 = _mm512_unpacklo_ps(wt237, wt238);
__m512 tmp10520 = _mm512_unpackhi_ps(wt237, wt238);
__m512 tmp10521 = _mm512_shuffle_ps(tmp10505, tmp10507, 68);
__m512 tmp10522 = _mm512_shuffle_ps(tmp10505, tmp10507, 238);
__m512 tmp10523 = _mm512_shuffle_ps(tmp10506, tmp10508, 68);
__m512 tmp10524 = _mm512_shuffle_ps(tmp10506, tmp10508, 238);
__m512 tmp10525 = _mm512_shuffle_ps(tmp10509, tmp10511, 68);
__m512 tmp10526 = _mm512_shuffle_ps(tmp10509, tmp10511, 238);
__m512 tmp10527 = _mm512_shuffle_ps(tmp10510, tmp10512, 68);
__m512 tmp10528 = _mm512_shuffle_ps(tmp10510, tmp10512, 238);
__m512 tmp10529 = _mm512_shuffle_ps(tmp10513, tmp10515, 68);
__m512 tmp10530 = _mm512_shuffle_ps(tmp10513, tmp10515, 238);
__m512 tmp10531 = _mm512_shuffle_ps(tmp10514, tmp10516, 68);
__m512 tmp10532 = _mm512_shuffle_ps(tmp10514, tmp10516, 238);
__m512 tmp10533 = _mm512_shuffle_ps(tmp10517, tmp10519, 68);
__m512 tmp10534 = _mm512_shuffle_ps(tmp10517, tmp10519, 238);
__m512 tmp10535 = _mm512_shuffle_ps(tmp10518, tmp10520, 68);
__m512 tmp10536 = _mm512_shuffle_ps(tmp10518, tmp10520, 238);
__m512 tmp10537 = _mm512_shuffle_f32x4(tmp10521, tmp10525, 136);
__m512 tmp10538 = _mm512_shuffle_f32x4(tmp10521, tmp10525, 221);
__m512 tmp10539 = _mm512_shuffle_f32x4(tmp10522, tmp10526, 136);
__m512 tmp10540 = _mm512_shuffle_f32x4(tmp10522, tmp10526, 221);
__m512 tmp10541 = _mm512_shuffle_f32x4(tmp10523, tmp10527, 136);
__m512 tmp10542 = _mm512_shuffle_f32x4(tmp10523, tmp10527, 221);
__m512 tmp10543 = _mm512_shuffle_f32x4(tmp10524, tmp10528, 136);
__m512 tmp10544 = _mm512_shuffle_f32x4(tmp10524, tmp10528, 221);
__m512 tmp10545 = _mm512_shuffle_f32x4(tmp10529, tmp10533, 136);
__m512 tmp10546 = _mm512_shuffle_f32x4(tmp10529, tmp10533, 221);
__m512 tmp10547 = _mm512_shuffle_f32x4(tmp10530, tmp10534, 136);
__m512 tmp10548 = _mm512_shuffle_f32x4(tmp10530, tmp10534, 221);
__m512 tmp10549 = _mm512_shuffle_f32x4(tmp10531, tmp10535, 136);
__m512 tmp10550 = _mm512_shuffle_f32x4(tmp10531, tmp10535, 221);
__m512 tmp10551 = _mm512_shuffle_f32x4(tmp10532, tmp10536, 136);
__m512 tmp10552 = _mm512_shuffle_f32x4(tmp10532, tmp10536, 221);
wt223 = _mm512_shuffle_f32x4(tmp10537, tmp10545, 136);
wt231 = _mm512_shuffle_f32x4(tmp10537, tmp10545, 221);
wt224 = _mm512_shuffle_f32x4(tmp10539, tmp10547, 136);
wt232 = _mm512_shuffle_f32x4(tmp10539, tmp10547, 221);
wt225 = _mm512_shuffle_f32x4(tmp10541, tmp10549, 136);
wt233 = _mm512_shuffle_f32x4(tmp10541, tmp10549, 221);
wt226 = _mm512_shuffle_f32x4(tmp10543, tmp10551, 136);
wt234 = _mm512_shuffle_f32x4(tmp10543, tmp10551, 221);
wt227 = _mm512_shuffle_f32x4(tmp10538, tmp10546, 136);
wt235 = _mm512_shuffle_f32x4(tmp10538, tmp10546, 221);
wt228 = _mm512_shuffle_f32x4(tmp10540, tmp10548, 136);
wt236 = _mm512_shuffle_f32x4(tmp10540, tmp10548, 221);
wt229 = _mm512_shuffle_f32x4(tmp10542, tmp10550, 136);
wt237 = _mm512_shuffle_f32x4(tmp10542, tmp10550, 221);
wt230 = _mm512_shuffle_f32x4(tmp10544, tmp10552, 136);
wt238 = _mm512_shuffle_f32x4(tmp10544, tmp10552, 221);
wt223 = _mm512_mul_ps(wt223, postMul25);
wt224 = _mm512_mul_ps(wt224, postMul25);
wt225 = _mm512_mul_ps(wt225, postMul25);
wt226 = _mm512_mul_ps(wt226, postMul25);
wt227 = _mm512_mul_ps(wt227, postMul25);
wt228 = _mm512_mul_ps(wt228, postMul25);
wt229 = _mm512_mul_ps(wt229, postMul25);
wt230 = _mm512_mul_ps(wt230, postMul25);
wt231 = _mm512_mul_ps(wt231, postMul25);
wt232 = _mm512_mul_ps(wt232, postMul25);
wt233 = _mm512_mul_ps(wt233, postMul25);
wt234 = _mm512_mul_ps(wt234, postMul25);
wt235 = _mm512_mul_ps(wt235, postMul25);
wt236 = _mm512_mul_ps(wt236, postMul25);
wt237 = _mm512_mul_ps(wt237, postMul25);
wt238 = _mm512_mul_ps(wt238, postMul25);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)0, 63>>cut10, wt223);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)0, 63>>cut10, wt224);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)0, 63>>cut10, wt225);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)0, 63>>cut10, wt226);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)0, 63>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)0, 63>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)0, 63>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)0, 63>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)0, 63>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)0, 63>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)0, 63>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)0, 63>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)0, 63>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)0, 63>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)0, 63>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)0, 63>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt223);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt224);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt225);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt226);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt223);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt224);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt225);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt226);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt227);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt228);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt229);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt230);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt231);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt232);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt233);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt234);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt235);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt236);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt237);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt238);
}
break;
}
default: {
cut10 = 4;
__m512 sum190 = _mm512_maskz_loadu_ps(65535, biasPtr8+2048*i31+4*k99);
__m512i pmMul15 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd15 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo12 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k99+512*i31));
__m512 masHi12 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k99+512*i31)+(ptrdiff_t)64);
__m512 postMul26 = _mm512_permutex2var_ps(masLo12, pmMul15, masHi12);
__m512 postAdd16 = _mm512_permutex2var_ps(masLo12, pmAdd15, masHi12);
sum190 = _mm512_fmadd_ps(sum190, postMul26, postAdd16);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum190);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum190);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 258048>>cut10, sum190);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)18432, 65535-(262143>>cut10), sum190);
ptrdiff_t c22 = 0;
for (; c22 != 16; ++c22) {
__m512 wt239 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)0);
__m512 wt240 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)1024);
__m512 wt241 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)2048);
__m512 wt242 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)3072);
__m512 wt243 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)4096);
__m512 wt244 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)5120);
__m512 wt245 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)6144);
__m512 wt246 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)7168);
__m512 wt247 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)8192);
__m512 wt248 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)9216);
__m512 wt249 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)10240);
__m512 wt250 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)11264);
__m512 wt251 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)12288);
__m512 wt252 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)13312);
__m512 wt253 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)14336);
__m512 wt254 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)15360);
__m512 tmp10553 = _mm512_unpacklo_ps(wt239, wt240);
__m512 tmp10554 = _mm512_unpackhi_ps(wt239, wt240);
__m512 tmp10555 = _mm512_unpacklo_ps(wt241, wt242);
__m512 tmp10556 = _mm512_unpackhi_ps(wt241, wt242);
__m512 tmp10557 = _mm512_unpacklo_ps(wt243, wt244);
__m512 tmp10558 = _mm512_unpackhi_ps(wt243, wt244);
__m512 tmp10559 = _mm512_unpacklo_ps(wt245, wt246);
__m512 tmp10560 = _mm512_unpackhi_ps(wt245, wt246);
__m512 tmp10561 = _mm512_unpacklo_ps(wt247, wt248);
__m512 tmp10562 = _mm512_unpackhi_ps(wt247, wt248);
__m512 tmp10563 = _mm512_unpacklo_ps(wt249, wt250);
__m512 tmp10564 = _mm512_unpackhi_ps(wt249, wt250);
__m512 tmp10565 = _mm512_unpacklo_ps(wt251, wt252);
__m512 tmp10566 = _mm512_unpackhi_ps(wt251, wt252);
__m512 tmp10567 = _mm512_unpacklo_ps(wt253, wt254);
__m512 tmp10568 = _mm512_unpackhi_ps(wt253, wt254);
__m512 tmp10569 = _mm512_shuffle_ps(tmp10553, tmp10555, 68);
__m512 tmp10570 = _mm512_shuffle_ps(tmp10553, tmp10555, 238);
__m512 tmp10571 = _mm512_shuffle_ps(tmp10554, tmp10556, 68);
__m512 tmp10572 = _mm512_shuffle_ps(tmp10554, tmp10556, 238);
__m512 tmp10573 = _mm512_shuffle_ps(tmp10557, tmp10559, 68);
__m512 tmp10574 = _mm512_shuffle_ps(tmp10557, tmp10559, 238);
__m512 tmp10575 = _mm512_shuffle_ps(tmp10558, tmp10560, 68);
__m512 tmp10576 = _mm512_shuffle_ps(tmp10558, tmp10560, 238);
__m512 tmp10577 = _mm512_shuffle_ps(tmp10561, tmp10563, 68);
__m512 tmp10578 = _mm512_shuffle_ps(tmp10561, tmp10563, 238);
__m512 tmp10579 = _mm512_shuffle_ps(tmp10562, tmp10564, 68);
__m512 tmp10580 = _mm512_shuffle_ps(tmp10562, tmp10564, 238);
__m512 tmp10581 = _mm512_shuffle_ps(tmp10565, tmp10567, 68);
__m512 tmp10582 = _mm512_shuffle_ps(tmp10565, tmp10567, 238);
__m512 tmp10583 = _mm512_shuffle_ps(tmp10566, tmp10568, 68);
__m512 tmp10584 = _mm512_shuffle_ps(tmp10566, tmp10568, 238);
__m512 tmp10585 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 136);
__m512 tmp10586 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 221);
__m512 tmp10587 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 136);
__m512 tmp10588 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 221);
__m512 tmp10589 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 136);
__m512 tmp10590 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 221);
__m512 tmp10591 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 136);
__m512 tmp10592 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 221);
__m512 tmp10593 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 136);
__m512 tmp10594 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 221);
__m512 tmp10595 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 136);
__m512 tmp10596 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 221);
__m512 tmp10597 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 136);
__m512 tmp10598 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 221);
__m512 tmp10599 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 136);
__m512 tmp10600 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 221);
wt239 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 136);
wt247 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 221);
wt240 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 136);
wt248 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 221);
wt241 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 136);
wt249 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 221);
wt242 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 136);
wt250 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 221);
wt243 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 136);
wt251 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 221);
wt244 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 136);
wt252 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 221);
wt245 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 136);
wt253 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 221);
wt246 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 136);
wt254 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 221);
wt239 = _mm512_mul_ps(wt239, postMul26);
wt240 = _mm512_mul_ps(wt240, postMul26);
wt241 = _mm512_mul_ps(wt241, postMul26);
wt242 = _mm512_mul_ps(wt242, postMul26);
wt243 = _mm512_mul_ps(wt243, postMul26);
wt244 = _mm512_mul_ps(wt244, postMul26);
wt245 = _mm512_mul_ps(wt245, postMul26);
wt246 = _mm512_mul_ps(wt246, postMul26);
wt247 = _mm512_mul_ps(wt247, postMul26);
wt248 = _mm512_mul_ps(wt248, postMul26);
wt249 = _mm512_mul_ps(wt249, postMul26);
wt250 = _mm512_mul_ps(wt250, postMul26);
wt251 = _mm512_mul_ps(wt251, postMul26);
wt252 = _mm512_mul_ps(wt252, postMul26);
wt253 = _mm512_mul_ps(wt253, postMul26);
wt254 = _mm512_mul_ps(wt254, postMul26);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)0, 63>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)0, 63>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)0, 63>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)0, 63>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)0, 63>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)0, 63>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)0, 63>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)0, 63>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)0, 63>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)0, 63>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)0, 63>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)0, 63>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)0, 63>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)0, 63>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)0, 63>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)0, 63>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt254);
}
}
}
} else {
ptrdiff_t k98 = 496;
ptrdiff_t l37 = (size_t)(0+k98)/6;
ptrdiff_t cut9 = (size_t)(0+k98)%6;
__m512 sum188 = _mm512_maskz_loadu_ps(65535, biasPtr8+2048*i31+4*k98);
__m512i pmMul16 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd16 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo13 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k98+512*i31));
__m512 masHi13 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k98+512*i31)+(ptrdiff_t)64);
__m512 postMul24 = _mm512_permutex2var_ps(masLo13, pmMul16, masHi13);
__m512 postAdd14 = _mm512_permutex2var_ps(masLo13, pmAdd16, masHi13);
sum188 = _mm512_fmadd_ps(sum188, postMul24, postAdd14);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*0+(ptrdiff_t)0, 63>>cut9, sum188);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*0+(ptrdiff_t)6144, 4032>>cut9, sum188);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*0+(ptrdiff_t)12288, 258048>>cut9, sum188);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*0+(ptrdiff_t)18432, 65535-(262143>>cut9), sum188);
ptrdiff_t c20 = 0;
for (; c20 != 16; ++c20) {
__m512 wt207 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)0);
__m512 wt208 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)1024);
__m512 wt209 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)2048);
__m512 wt210 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)3072);
__m512 wt211 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)4096);
__m512 wt212 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)5120);
__m512 wt213 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)6144);
__m512 wt214 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)7168);
__m512 wt215 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)8192);
__m512 wt216 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)9216);
__m512 wt217 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)10240);
__m512 wt218 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)11264);
__m512 wt219 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)12288);
__m512 wt220 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)13312);
__m512 wt221 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)14336);
__m512 wt222 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)15360);
__m512 tmp10601 = _mm512_unpacklo_ps(wt207, wt208);
__m512 tmp10602 = _mm512_unpackhi_ps(wt207, wt208);
__m512 tmp10603 = _mm512_unpacklo_ps(wt209, wt210);
__m512 tmp10604 = _mm512_unpackhi_ps(wt209, wt210);
__m512 tmp10605 = _mm512_unpacklo_ps(wt211, wt212);
__m512 tmp10606 = _mm512_unpackhi_ps(wt211, wt212);
__m512 tmp10607 = _mm512_unpacklo_ps(wt213, wt214);
__m512 tmp10608 = _mm512_unpackhi_ps(wt213, wt214);
__m512 tmp10609 = _mm512_unpacklo_ps(wt215, wt216);
__m512 tmp10610 = _mm512_unpackhi_ps(wt215, wt216);
__m512 tmp10611 = _mm512_unpacklo_ps(wt217, wt218);
__m512 tmp10612 = _mm512_unpackhi_ps(wt217, wt218);
__m512 tmp10613 = _mm512_unpacklo_ps(wt219, wt220);
__m512 tmp10614 = _mm512_unpackhi_ps(wt219, wt220);
__m512 tmp10615 = _mm512_unpacklo_ps(wt221, wt222);
__m512 tmp10616 = _mm512_unpackhi_ps(wt221, wt222);
__m512 tmp10617 = _mm512_shuffle_ps(tmp10601, tmp10603, 68);
__m512 tmp10618 = _mm512_shuffle_ps(tmp10601, tmp10603, 238);
__m512 tmp10619 = _mm512_shuffle_ps(tmp10602, tmp10604, 68);
__m512 tmp10620 = _mm512_shuffle_ps(tmp10602, tmp10604, 238);
__m512 tmp10621 = _mm512_shuffle_ps(tmp10605, tmp10607, 68);
__m512 tmp10622 = _mm512_shuffle_ps(tmp10605, tmp10607, 238);
__m512 tmp10623 = _mm512_shuffle_ps(tmp10606, tmp10608, 68);
__m512 tmp10624 = _mm512_shuffle_ps(tmp10606, tmp10608, 238);
__m512 tmp10625 = _mm512_shuffle_ps(tmp10609, tmp10611, 68);
__m512 tmp10626 = _mm512_shuffle_ps(tmp10609, tmp10611, 238);
__m512 tmp10627 = _mm512_shuffle_ps(tmp10610, tmp10612, 68);
__m512 tmp10628 = _mm512_shuffle_ps(tmp10610, tmp10612, 238);
__m512 tmp10629 = _mm512_shuffle_ps(tmp10613, tmp10615, 68);
__m512 tmp10630 = _mm512_shuffle_ps(tmp10613, tmp10615, 238);
__m512 tmp10631 = _mm512_shuffle_ps(tmp10614, tmp10616, 68);
__m512 tmp10632 = _mm512_shuffle_ps(tmp10614, tmp10616, 238);
__m512 tmp10633 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 136);
__m512 tmp10634 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 221);
__m512 tmp10635 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 136);
__m512 tmp10636 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 221);
__m512 tmp10637 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 136);
__m512 tmp10638 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 221);
__m512 tmp10639 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 136);
__m512 tmp10640 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 221);
__m512 tmp10641 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 136);
__m512 tmp10642 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 221);
__m512 tmp10643 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 136);
__m512 tmp10644 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 221);
__m512 tmp10645 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 136);
__m512 tmp10646 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 221);
__m512 tmp10647 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 136);
__m512 tmp10648 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 221);
wt207 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 136);
wt215 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 221);
wt208 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 136);
wt216 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 221);
wt209 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 136);
wt217 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 221);
wt210 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 136);
wt218 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 221);
wt211 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 136);
wt219 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 221);
wt212 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 136);
wt220 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 221);
wt213 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 136);
wt221 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 221);
wt214 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 136);
wt222 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 221);
wt207 = _mm512_mul_ps(wt207, postMul24);
wt208 = _mm512_mul_ps(wt208, postMul24);
wt209 = _mm512_mul_ps(wt209, postMul24);
wt210 = _mm512_mul_ps(wt210, postMul24);
wt211 = _mm512_mul_ps(wt211, postMul24);
wt212 = _mm512_mul_ps(wt212, postMul24);
wt213 = _mm512_mul_ps(wt213, postMul24);
wt214 = _mm512_mul_ps(wt214, postMul24);
wt215 = _mm512_mul_ps(wt215, postMul24);
wt216 = _mm512_mul_ps(wt216, postMul24);
wt217 = _mm512_mul_ps(wt217, postMul24);
wt218 = _mm512_mul_ps(wt218, postMul24);
wt219 = _mm512_mul_ps(wt219, postMul24);
wt220 = _mm512_mul_ps(wt220, postMul24);
wt221 = _mm512_mul_ps(wt221, postMul24);
wt222 = _mm512_mul_ps(wt222, postMul24);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(1+16*c20)+(ptrdiff_t)0, 63>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(2+16*c20)+(ptrdiff_t)0, 63>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(3+16*c20)+(ptrdiff_t)0, 63>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(4+16*c20)+(ptrdiff_t)0, 63>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(5+16*c20)+(ptrdiff_t)0, 63>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(6+16*c20)+(ptrdiff_t)0, 63>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(7+16*c20)+(ptrdiff_t)0, 63>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(8+16*c20)+(ptrdiff_t)0, 63>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(9+16*c20)+(ptrdiff_t)0, 63>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(10+16*c20)+(ptrdiff_t)0, 63>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(11+16*c20)+(ptrdiff_t)0, 63>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(12+16*c20)+(ptrdiff_t)0, 63>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(13+16*c20)+(ptrdiff_t)0, 63>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(14+16*c20)+(ptrdiff_t)0, 63>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(15+16*c20)+(ptrdiff_t)0, 63>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(16+16*c20)+(ptrdiff_t)0, 63>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(1+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(2+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(3+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(4+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(5+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(6+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(7+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(8+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(9+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(10+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(11+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(12+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(13+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(14+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(15+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(16+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(1+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(2+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(3+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(4+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(5+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(6+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(7+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(8+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(9+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(10+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(11+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(12+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(13+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(14+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(15+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(16+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(1+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(2+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(3+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(4+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(5+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(6+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(7+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(8+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(9+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(10+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(11+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(12+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(13+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(14+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(15+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(16+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt222);
}
}
}
}
}

// Launches the weight-arrangement pass for this layer on the thread team.
//
// Fills in a threader task whose callee is ResNeXt50OneArrangeWts4Callee1,
// whose opaque payload is `tensors45` (passed through untouched), and whose
// iteration hull is 3-dimensional with extents 16 x 1 x 1, then hands it to
// ResNeXt50ThreaderDo1.
// NOTE(review): presumably the threader invokes the callee once per hull
// point and returns when all points are done -- confirm against
// ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeWts4(ResNeXt50ThreaderTeam1* team36, char** tensors45) {
	static const int extents[3] = {16, 1, 1};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	for (int axis = 0; axis < 3; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	job.any1 = tensors45;
	job.callee1 = ResNeXt50OneArrangeWts4Callee1;
	ResNeXt50ThreaderDo1(team36, &job);
}

// Thread-task worker for the data-arrangement pass of this layer.
//
// task50->any1 is the tensor pointer array: entry 0 is the source data,
// entry 1 is the arranged destination buffer.
// pt30[0] picks a slab of 128 consecutive channel indices (128*slab ..
// 128*slab+127); pt30[1] picks the band, which fixes both the source row
// offset (4*band rows of 224 bytes) and the destination block (65536*band).
//
// For every channel in the slab, two source rows (byte offsets 0 and 448
// within the band) are read and only the even-indexed floats are kept: each
// row yields one full vector of 16 values and one vector whose low 12 lanes
// are populated (remaining lanes are zero from the masked loads). The four
// resulting vectors are written back-to-back as one 256-byte record.
// NOTE(review): this looks like the stride-2 subsampling of a 56-wide input
// into 28-wide rows -- confirm against the layer's configuration.
static void ResNeXt50OneArrangeDats4Callee1(ResNeXt50ThreaderTask1* task50, int64_t* pt30) {
	char** bufs = task50->any1;
	const ptrdiff_t slab = pt30[0];
	const ptrdiff_t band = pt30[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	// Selects lanes 0,2,...,14 of the first operand followed by lanes
	// 0,2,...,14 of the second operand (indices 16..30).
	const __m512i evenLanes = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
	// Four source rows per band; computed through size_t exactly as the
	// generated code does.
	const ptrdiff_t firstRow = (ptrdiff_t)((size_t)band*4);
	const ptrdiff_t chanStop = 128*slab+128;
	// Single-image loop retained so the per-image strides stay visible.
	for (ptrdiff_t img = 0; img < 1; ++img) {
		for (ptrdiff_t chan = 128*slab; chan < chanStop; ++chan) {
			const char* in = src+3227648*img+224*firstRow+12608*chan;
			char* out = dst+917504*img+65536*band+256*chan;
			// Upper row of the pair: 15+15 floats, then 15+7 floats.
			// The masks drop the lanes the even-lane permute never reads.
			__m512 topA = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)0);
			__m512 topB = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)64);
			__m512 topFront = _mm512_permutex2var_ps(topA, evenLanes, topB);
			__m512 topC = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)128);
			__m512 topD = _mm512_maskz_loadu_ps(127, in+(ptrdiff_t)192);
			__m512 topBack = _mm512_permutex2var_ps(topC, evenLanes, topD);
			// Lower row of the pair, two 224-byte rows further on.
			__m512 botA = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)448);
			__m512 botB = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)512);
			__m512 botFront = _mm512_permutex2var_ps(botA, evenLanes, botB);
			__m512 botC = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)576);
			__m512 botD = _mm512_maskz_loadu_ps(127, in+(ptrdiff_t)640);
			__m512 botBack = _mm512_permutex2var_ps(botC, evenLanes, botD);
			// One 256-byte record per channel.
			_mm512_storeu_ps(out+(ptrdiff_t)0, topFront);
			_mm512_storeu_ps(out+(ptrdiff_t)64, topBack);
			_mm512_storeu_ps(out+(ptrdiff_t)128, botFront);
			_mm512_storeu_ps(out+(ptrdiff_t)192, botBack);
		}
	}
}

// Launches the data-arrangement pass for this layer on the thread team.
//
// Fills in a threader task whose callee is ResNeXt50OneArrangeDats4Callee1,
// whose opaque payload is `tensors47` (passed through untouched), and whose
// iteration hull is 4-dimensional with extents 2 x 14 x 1 x 1, then hands it
// to ResNeXt50ThreaderDo1. The callee reads coordinate 0 as the channel slab
// and coordinate 1 as the band.
// NOTE(review): presumably the threader invokes the callee once per hull
// point -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeDats4(ResNeXt50ThreaderTeam1* team37, char** tensors47) {
	static const int extents[4] = {2, 14, 1, 1};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 4;
	for (int axis = 0; axis < 4; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	job.any1 = tensors47;
	job.callee1 = ResNeXt50OneArrangeDats4Callee1;
	ResNeXt50ThreaderDo1(team37, &job);
}

// Thread-task worker for the apply (multiply-accumulate) pass of this layer.
//
// task52->any1 is a pointer pair; its first entry is the tensor pointer array:
//   tensors50[0] = arranged weights, tensors50[1] = arranged data,
//   tensors50[2] = output.
// pt31[0] (w47) selects a pair of weight groups: k101 = 2*w47 and 2*w47+1.
// pt31[1] (d10) selects the data slab j26 and the output offset h39 = 2*j26.
//
// Weight groups 0..84 use 24-byte records (six floats per record) and yield
// six output blocks each; group 85 is handled by the tail code with 8-byte
// records (two floats per record) and yields two output blocks.
// Each output block is written as 16+12 floats at byte offset 0 and 16+12
// floats at byte offset 112; consecutive blocks are 3136 bytes apart.
// NOTE(review): this looks like six/two output channels of two 28-float rows
// each -- confirm against the arrange passes and the layer configuration.
static void ResNeXt50OneApply4Callee1(ResNeXt50ThreaderTask1* task52, int64_t* pt31) {
void** pair12 = task52->any1;
char** tensors50 = pair12[0];
// e15 and g17 are fixed at zero here, so the offsets scaled by them vanish.
ptrdiff_t e15 = 0;
ptrdiff_t g17 = 0;
ptrdiff_t d10 = pt31[1];
ptrdiff_t w47 = pt31[0];
char*restrict arrangedWts4 = tensors50[0]+1712128*e15+(ptrdiff_t)526336*1*g17;
char*restrict arrangedDats4 = tensors50[1]+2992640*e15+(ptrdiff_t)917504*1*g17;
char*restrict datPtr15 = tensors50[2]+(ptrdiff_t)1605632*1*g17;
ptrdiff_t ii20 = 1;
for (ptrdiff_t i33 = 0; i33 < ii20; ++i33) {
ptrdiff_t j26 = 1*d10;
// jj32 equals j26, so the `j26 >= jj32` check after the tail always returns.
ptrdiff_t jj32 = j26+0;
ptrdiff_t h39 = 0+((size_t)j26-0)/1*2;
switch (((size_t)j26-0)%1) {
default: {
wrap4:;
// This task covers weight groups k101 and k101+1 (kk31 is the second one).
ptrdiff_t k101 = 2*w47;
ptrdiff_t kk31 = k101+1;
// Six-wide groups. The loop exits by reaching 85 only when the second group
// of the pair is the tail group; otherwise it leaves via the return below.
for (; k101 != 85; ++k101) {
// With s21 == -1 the addresses below resolve to bytes 0..23 of the group:
// the six initial accumulator values stored ahead of the first weight record.
// NOTE(review): presumably the bias folded in by the weight-arrange pass -- confirm.
ptrdiff_t s21 = -1;
__m512 sum191 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)24));
__m512 sum195 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)28));
__m512 sum199 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)32));
__m512 sum203 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)36));
__m512 sum207 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)40));
__m512 sum211 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)44));
// Four accumulators per output block (one per data vector), all seeded
// with that block's initial value.
__m512 sum192 = sum191;
__m512 sum193 = sum191;
__m512 sum194 = sum191;
__m512 sum196 = sum195;
__m512 sum197 = sum195;
__m512 sum198 = sum195;
__m512 sum200 = sum199;
__m512 sum201 = sum199;
__m512 sum202 = sum199;
__m512 sum204 = sum203;
__m512 sum205 = sum203;
__m512 sum206 = sum203;
__m512 sum208 = sum207;
__m512 sum209 = sum207;
__m512 sum210 = sum207;
__m512 sum212 = sum211;
__m512 sum213 = sum211;
__m512 sum214 = sum211;
// 256 reduction steps: each loads one 256-byte data record (four vectors)
// and broadcasts six weights, giving 24 fused multiply-adds per step.
for (s21 = 0; s21 < 256; ++s21) {
__m512 dat1653 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s21+(ptrdiff_t)0);
__m512 dat1654 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s21+(ptrdiff_t)64);
__m512 dat1655 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s21+(ptrdiff_t)128);
__m512 dat1656 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s21+(ptrdiff_t)192);
__m512 wt255 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)24));
sum191 = _mm512_fmadd_ps(wt255, dat1653, sum191);
sum192 = _mm512_fmadd_ps(wt255, dat1654, sum192);
sum193 = _mm512_fmadd_ps(wt255, dat1655, sum193);
sum194 = _mm512_fmadd_ps(wt255, dat1656, sum194);
__m512 wt256 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)28));
sum195 = _mm512_fmadd_ps(wt256, dat1653, sum195);
sum196 = _mm512_fmadd_ps(wt256, dat1654, sum196);
sum197 = _mm512_fmadd_ps(wt256, dat1655, sum197);
sum198 = _mm512_fmadd_ps(wt256, dat1656, sum198);
__m512 wt257 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)32));
sum199 = _mm512_fmadd_ps(wt257, dat1653, sum199);
sum200 = _mm512_fmadd_ps(wt257, dat1654, sum200);
sum201 = _mm512_fmadd_ps(wt257, dat1655, sum201);
sum202 = _mm512_fmadd_ps(wt257, dat1656, sum202);
__m512 wt258 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)36));
sum203 = _mm512_fmadd_ps(wt258, dat1653, sum203);
sum204 = _mm512_fmadd_ps(wt258, dat1654, sum204);
sum205 = _mm512_fmadd_ps(wt258, dat1655, sum205);
sum206 = _mm512_fmadd_ps(wt258, dat1656, sum206);
__m512 wt259 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)40));
sum207 = _mm512_fmadd_ps(wt259, dat1653, sum207);
sum208 = _mm512_fmadd_ps(wt259, dat1654, sum208);
sum209 = _mm512_fmadd_ps(wt259, dat1655, sum209);
sum210 = _mm512_fmadd_ps(wt259, dat1656, sum210);
__m512 wt260 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+24*s21+(ptrdiff_t)44));
sum211 = _mm512_fmadd_ps(wt260, dat1653, sum211);
sum212 = _mm512_fmadd_ps(wt260, dat1654, sum212);
sum213 = _mm512_fmadd_ps(wt260, dat1655, sum213);
sum214 = _mm512_fmadd_ps(wt260, dat1656, sum214);
}
// Write-out. Mask 65535 stores all 16 lanes and mask 4095 stores the low
// 12 lanes, so each pair of stores writes 28 contiguous floats. The second
// pair starts 112 bytes (28 floats) after the first; successive output
// blocks are 3136 bytes apart.
__m512 dat1657 = sum192;
__m512 dat1658 = sum194;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)0, 65535, sum191);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)64, 4095, dat1657);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)112, 65535, sum193);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)176, 4095, dat1658);
__m512 dat1659 = sum196;
__m512 dat1660 = sum198;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3136, 65535, sum195);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3200, 4095, dat1659);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3248, 65535, sum197);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3312, 4095, dat1660);
__m512 dat1661 = sum200;
__m512 dat1662 = sum202;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)6272, 65535, sum199);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)6336, 4095, dat1661);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)6384, 65535, sum201);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)6448, 4095, dat1662);
__m512 dat1663 = sum204;
__m512 dat1664 = sum206;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)9408, 65535, sum203);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)9472, 4095, dat1663);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)9520, 65535, sum205);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)9584, 4095, dat1664);
__m512 dat1665 = sum208;
__m512 dat1666 = sum210;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)12544, 65535, sum207);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)12608, 4095, dat1665);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)12656, 65535, sum209);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)12720, 4095, dat1666);
__m512 dat1667 = sum212;
__m512 dat1668 = sum214;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)15680, 65535, sum211);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)15744, 4095, dat1667);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)15792, 65535, sum213);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)15856, 4095, dat1668);
// Done once the second group of this task's pair has been written.
if (k101 >= kk31) return;
}
// Tail: reached only when k101 == 85 (the loop's exit condition). This group
// has two floats per record (8-byte stride), so only two output blocks.
// As above, s22 == -1 addresses the initial values at bytes 0..7 of the group.
ptrdiff_t s22 = -1;
__m512 sum215 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+8*s22+(ptrdiff_t)8));
__m512 sum219 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+8*s22+(ptrdiff_t)12));
__m512 sum216 = sum215;
__m512 sum217 = sum215;
__m512 sum218 = sum215;
__m512 sum220 = sum219;
__m512 sum221 = sum219;
__m512 sum222 = sum219;
for (s22 = 0; s22 < 256; ++s22) {
__m512 dat1669 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)0);
__m512 dat1670 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)64);
__m512 dat1671 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)128);
__m512 dat1672 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)192);
__m512 wt261 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+8*s22+(ptrdiff_t)8));
sum215 = _mm512_fmadd_ps(wt261, dat1669, sum215);
sum216 = _mm512_fmadd_ps(wt261, dat1670, sum216);
sum217 = _mm512_fmadd_ps(wt261, dat1671, sum217);
sum218 = _mm512_fmadd_ps(wt261, dat1672, sum218);
__m512 wt262 = _mm512_set1_ps(*(float*)(arrangedWts4+526336*i33+6168*k101+8*s22+(ptrdiff_t)12));
sum219 = _mm512_fmadd_ps(wt262, dat1669, sum219);
sum220 = _mm512_fmadd_ps(wt262, dat1670, sum220);
sum221 = _mm512_fmadd_ps(wt262, dat1671, sum221);
sum222 = _mm512_fmadd_ps(wt262, dat1672, sum222);
}
// Same store layout as the six-wide path, for two output blocks.
__m512 dat1673 = sum216;
__m512 dat1674 = sum218;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)0, 65535, sum215);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)64, 4095, dat1673);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)112, 65535, sum217);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)176, 4095, dat1674);
__m512 dat1675 = sum220;
__m512 dat1676 = sum222;
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3136, 65535, sum219);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3200, 4095, dat1675);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3248, 65535, sum221);
_mm512_mask_storeu_ps(datPtr15+1605632*i33+112*h39+18816*k101+(ptrdiff_t)3312, 4095, dat1676);
// jj32 == j26, so this always returns; the wrap-around steps below are
// not reached in this configuration.
if (j26 >= jj32) return;
if (j26 >= 13) break;
++j26;
h39 += 2;
goto wrap4;
}
}
j26 = 14;
}
}

/*
 * ResNeXt50OneApply4: dispatch ResNeXt50OneApply4Callee1 through the threader.
 *
 * team38    - threader team handed unchanged to ResNeXt50ThreaderDo1.
 * tensors49 - tensor pointer table; it is placed in slot 0 of a two-entry
 *             context array whose second slot is a null pointer, and that
 *             array is what the callee receives via the task's any1 field.
 *
 * The task is described as 3-dimensional with extents 43 x 14 x 1.
 * NOTE(review): how those extents are partitioned across threads is decided
 * inside ResNeXt50ThreaderDo1, which is not visible here.
 * NOTE(review): both the context array and the task live on this stack frame,
 * so this presumably relies on ResNeXt50ThreaderDo1 not using them after it
 * returns -- confirm against the threader implementation.
 */
static void ResNeXt50OneApply4(ResNeXt50ThreaderTeam1* team38, char** tensors49) {
	ResNeXt50ThreaderTask1 job;
	void* ctx[2];

	/* Context handed to the callee: {tensor table, null}. */
	ctx[0] = tensors49;
	ctx[1] = 0;

	/* Extents of the task, one entry per dimension. */
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 14;
	job.hull1[0] = 43;

	job.any1 = ctx;
	job.callee1 = ResNeXt50OneApply4Callee1;

	ResNeXt50ThreaderDo1(team38, &job);
}

static void ResNeXt50OneArrangeWts5Callee1(ResNeXt50ThreaderTask1* task54, int64_t* pt32) {
char** tensors52 = task54->any1;
ptrdiff_t b54 = pt32[0];
char*restrict wtPtr9 = tensors52[0]+(ptrdiff_t)3340*0+(ptrdiff_t)262144*0;
char*restrict biasPtr9 = tensors52[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr10 = tensors52[2]+(ptrdiff_t)8*256*0;
char*restrict arranged9 = tensors52[3]+(ptrdiff_t)856064*0+(ptrdiff_t)263168*0;
ptrdiff_t ii21 = 1;
for (ptrdiff_t i34 = 0; i34 < ii21; ++i34) {
ptrdiff_t j27 = 2*b54;
ptrdiff_t jj33 = j27+2;
for (; j27 < jj33; ++j27) {
if (j27 < 15) {
ptrdiff_t k103 = 0+16*(j27-0);
ptrdiff_t l40 = (size_t)(0+k103)/6;
ptrdiff_t cut12 = (size_t)(0+k103)%6;
switch (cut12) {
case 0:;
case 2: {
__m512 sum224 = _mm512_maskz_loadu_ps(65535, biasPtr9+1024*i34+4*k103);
__m512i pmMul17 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd17 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo14 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k103+256*i34));
__m512 masHi14 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k103+256*i34)+(ptrdiff_t)64);
__m512 postMul28 = _mm512_permutex2var_ps(masLo14, pmMul17, masHi14);
__m512 postAdd18 = _mm512_permutex2var_ps(masLo14, pmAdd17, masHi14);
sum224 = _mm512_fmadd_ps(sum224, postMul28, postAdd18);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum224);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum224);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 65535-(4095>>cut12), sum224);
ptrdiff_t c25 = 0;
for (; c25 != 16; ++c25) {
__m512 wt279 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)0);
__m512 wt280 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)1024);
__m512 wt281 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)2048);
__m512 wt282 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)3072);
__m512 wt283 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)4096);
__m512 wt284 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)5120);
__m512 wt285 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)6144);
__m512 wt286 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)7168);
__m512 wt287 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)8192);
__m512 wt288 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)9216);
__m512 wt289 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)10240);
__m512 wt290 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)11264);
__m512 wt291 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)12288);
__m512 wt292 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)13312);
__m512 wt293 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)14336);
__m512 wt294 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)15360);
__m512 tmp10649 = _mm512_unpacklo_ps(wt279, wt280);
__m512 tmp10650 = _mm512_unpackhi_ps(wt279, wt280);
__m512 tmp10651 = _mm512_unpacklo_ps(wt281, wt282);
__m512 tmp10652 = _mm512_unpackhi_ps(wt281, wt282);
__m512 tmp10653 = _mm512_unpacklo_ps(wt283, wt284);
__m512 tmp10654 = _mm512_unpackhi_ps(wt283, wt284);
__m512 tmp10655 = _mm512_unpacklo_ps(wt285, wt286);
__m512 tmp10656 = _mm512_unpackhi_ps(wt285, wt286);
__m512 tmp10657 = _mm512_unpacklo_ps(wt287, wt288);
__m512 tmp10658 = _mm512_unpackhi_ps(wt287, wt288);
__m512 tmp10659 = _mm512_unpacklo_ps(wt289, wt290);
__m512 tmp10660 = _mm512_unpackhi_ps(wt289, wt290);
__m512 tmp10661 = _mm512_unpacklo_ps(wt291, wt292);
__m512 tmp10662 = _mm512_unpackhi_ps(wt291, wt292);
__m512 tmp10663 = _mm512_unpacklo_ps(wt293, wt294);
__m512 tmp10664 = _mm512_unpackhi_ps(wt293, wt294);
__m512 tmp10665 = _mm512_shuffle_ps(tmp10649, tmp10651, 68);
__m512 tmp10666 = _mm512_shuffle_ps(tmp10649, tmp10651, 238);
__m512 tmp10667 = _mm512_shuffle_ps(tmp10650, tmp10652, 68);
__m512 tmp10668 = _mm512_shuffle_ps(tmp10650, tmp10652, 238);
__m512 tmp10669 = _mm512_shuffle_ps(tmp10653, tmp10655, 68);
__m512 tmp10670 = _mm512_shuffle_ps(tmp10653, tmp10655, 238);
__m512 tmp10671 = _mm512_shuffle_ps(tmp10654, tmp10656, 68);
__m512 tmp10672 = _mm512_shuffle_ps(tmp10654, tmp10656, 238);
__m512 tmp10673 = _mm512_shuffle_ps(tmp10657, tmp10659, 68);
__m512 tmp10674 = _mm512_shuffle_ps(tmp10657, tmp10659, 238);
__m512 tmp10675 = _mm512_shuffle_ps(tmp10658, tmp10660, 68);
__m512 tmp10676 = _mm512_shuffle_ps(tmp10658, tmp10660, 238);
__m512 tmp10677 = _mm512_shuffle_ps(tmp10661, tmp10663, 68);
__m512 tmp10678 = _mm512_shuffle_ps(tmp10661, tmp10663, 238);
__m512 tmp10679 = _mm512_shuffle_ps(tmp10662, tmp10664, 68);
__m512 tmp10680 = _mm512_shuffle_ps(tmp10662, tmp10664, 238);
__m512 tmp10681 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 136);
__m512 tmp10682 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 221);
__m512 tmp10683 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 136);
__m512 tmp10684 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 221);
__m512 tmp10685 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 136);
__m512 tmp10686 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 221);
__m512 tmp10687 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 136);
__m512 tmp10688 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 221);
__m512 tmp10689 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 136);
__m512 tmp10690 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 221);
__m512 tmp10691 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 136);
__m512 tmp10692 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 221);
__m512 tmp10693 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 136);
__m512 tmp10694 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 221);
__m512 tmp10695 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 136);
__m512 tmp10696 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 221);
wt279 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 136);
wt287 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 221);
wt280 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 136);
wt288 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 221);
wt281 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 136);
wt289 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 221);
wt282 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 136);
wt290 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 221);
wt283 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 136);
wt291 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 221);
wt284 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 136);
wt292 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 221);
wt285 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 136);
wt293 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 221);
wt286 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 136);
wt294 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 221);
wt279 = _mm512_mul_ps(wt279, postMul28);
wt280 = _mm512_mul_ps(wt280, postMul28);
wt281 = _mm512_mul_ps(wt281, postMul28);
wt282 = _mm512_mul_ps(wt282, postMul28);
wt283 = _mm512_mul_ps(wt283, postMul28);
wt284 = _mm512_mul_ps(wt284, postMul28);
wt285 = _mm512_mul_ps(wt285, postMul28);
wt286 = _mm512_mul_ps(wt286, postMul28);
wt287 = _mm512_mul_ps(wt287, postMul28);
wt288 = _mm512_mul_ps(wt288, postMul28);
wt289 = _mm512_mul_ps(wt289, postMul28);
wt290 = _mm512_mul_ps(wt290, postMul28);
wt291 = _mm512_mul_ps(wt291, postMul28);
wt292 = _mm512_mul_ps(wt292, postMul28);
wt293 = _mm512_mul_ps(wt293, postMul28);
wt294 = _mm512_mul_ps(wt294, postMul28);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)0, 63>>cut12, wt279);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)0, 63>>cut12, wt280);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)0, 63>>cut12, wt281);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)0, 63>>cut12, wt282);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)0, 63>>cut12, wt283);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)0, 63>>cut12, wt284);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)0, 63>>cut12, wt285);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)0, 63>>cut12, wt286);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)0, 63>>cut12, wt287);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)0, 63>>cut12, wt288);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)0, 63>>cut12, wt289);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)0, 63>>cut12, wt290);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)0, 63>>cut12, wt291);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)0, 63>>cut12, wt292);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)0, 63>>cut12, wt293);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)0, 63>>cut12, wt294);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt279);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt280);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt281);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt282);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt283);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt284);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt285);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt286);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt287);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt288);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt289);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt290);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt291);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt292);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt293);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt294);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt279);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt280);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt281);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt282);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt283);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt284);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt285);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt286);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt287);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt288);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt289);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt290);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt291);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt292);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt293);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt294);
}
break;
}
default: {
cut12 = 4;
__m512 sum225 = _mm512_maskz_loadu_ps(65535, biasPtr9+1024*i34+4*k103);
__m512i pmMul18 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd18 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo15 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k103+256*i34));
__m512 masHi15 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k103+256*i34)+(ptrdiff_t)64);
__m512 postMul29 = _mm512_permutex2var_ps(masLo15, pmMul18, masHi15);
__m512 postAdd19 = _mm512_permutex2var_ps(masLo15, pmAdd18, masHi15);
sum225 = _mm512_fmadd_ps(sum225, postMul29, postAdd19);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum225);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum225);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 258048>>cut12, sum225);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)18432, 65535-(262143>>cut12), sum225);
ptrdiff_t c26 = 0;
for (; c26 != 16; ++c26) {
__m512 wt295 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)0);
__m512 wt296 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)1024);
__m512 wt297 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)2048);
__m512 wt298 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)3072);
__m512 wt299 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)4096);
__m512 wt300 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)5120);
__m512 wt301 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)6144);
__m512 wt302 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)7168);
__m512 wt303 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)8192);
__m512 wt304 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)9216);
__m512 wt305 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)10240);
__m512 wt306 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)11264);
__m512 wt307 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)12288);
__m512 wt308 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)13312);
__m512 wt309 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)14336);
__m512 wt310 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)15360);
__m512 tmp10697 = _mm512_unpacklo_ps(wt295, wt296);
__m512 tmp10698 = _mm512_unpackhi_ps(wt295, wt296);
__m512 tmp10699 = _mm512_unpacklo_ps(wt297, wt298);
__m512 tmp10700 = _mm512_unpackhi_ps(wt297, wt298);
__m512 tmp10701 = _mm512_unpacklo_ps(wt299, wt300);
__m512 tmp10702 = _mm512_unpackhi_ps(wt299, wt300);
__m512 tmp10703 = _mm512_unpacklo_ps(wt301, wt302);
__m512 tmp10704 = _mm512_unpackhi_ps(wt301, wt302);
__m512 tmp10705 = _mm512_unpacklo_ps(wt303, wt304);
__m512 tmp10706 = _mm512_unpackhi_ps(wt303, wt304);
__m512 tmp10707 = _mm512_unpacklo_ps(wt305, wt306);
__m512 tmp10708 = _mm512_unpackhi_ps(wt305, wt306);
__m512 tmp10709 = _mm512_unpacklo_ps(wt307, wt308);
__m512 tmp10710 = _mm512_unpackhi_ps(wt307, wt308);
__m512 tmp10711 = _mm512_unpacklo_ps(wt309, wt310);
__m512 tmp10712 = _mm512_unpackhi_ps(wt309, wt310);
__m512 tmp10713 = _mm512_shuffle_ps(tmp10697, tmp10699, 68);
__m512 tmp10714 = _mm512_shuffle_ps(tmp10697, tmp10699, 238);
__m512 tmp10715 = _mm512_shuffle_ps(tmp10698, tmp10700, 68);
__m512 tmp10716 = _mm512_shuffle_ps(tmp10698, tmp10700, 238);
__m512 tmp10717 = _mm512_shuffle_ps(tmp10701, tmp10703, 68);
__m512 tmp10718 = _mm512_shuffle_ps(tmp10701, tmp10703, 238);
__m512 tmp10719 = _mm512_shuffle_ps(tmp10702, tmp10704, 68);
__m512 tmp10720 = _mm512_shuffle_ps(tmp10702, tmp10704, 238);
__m512 tmp10721 = _mm512_shuffle_ps(tmp10705, tmp10707, 68);
__m512 tmp10722 = _mm512_shuffle_ps(tmp10705, tmp10707, 238);
__m512 tmp10723 = _mm512_shuffle_ps(tmp10706, tmp10708, 68);
__m512 tmp10724 = _mm512_shuffle_ps(tmp10706, tmp10708, 238);
__m512 tmp10725 = _mm512_shuffle_ps(tmp10709, tmp10711, 68);
__m512 tmp10726 = _mm512_shuffle_ps(tmp10709, tmp10711, 238);
__m512 tmp10727 = _mm512_shuffle_ps(tmp10710, tmp10712, 68);
__m512 tmp10728 = _mm512_shuffle_ps(tmp10710, tmp10712, 238);
__m512 tmp10729 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 136);
__m512 tmp10730 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 221);
__m512 tmp10731 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 136);
__m512 tmp10732 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 221);
__m512 tmp10733 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 136);
__m512 tmp10734 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 221);
__m512 tmp10735 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 136);
__m512 tmp10736 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 221);
__m512 tmp10737 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 136);
__m512 tmp10738 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 221);
__m512 tmp10739 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 136);
__m512 tmp10740 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 221);
__m512 tmp10741 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 136);
__m512 tmp10742 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 221);
__m512 tmp10743 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 136);
__m512 tmp10744 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 221);
wt295 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 136);
wt303 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 221);
wt296 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 136);
wt304 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 221);
wt297 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 136);
wt305 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 221);
wt298 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 136);
wt306 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 221);
wt299 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 136);
wt307 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 221);
wt300 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 136);
wt308 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 221);
wt301 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 136);
wt309 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 221);
wt302 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 136);
wt310 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 221);
wt295 = _mm512_mul_ps(wt295, postMul29);
wt296 = _mm512_mul_ps(wt296, postMul29);
wt297 = _mm512_mul_ps(wt297, postMul29);
wt298 = _mm512_mul_ps(wt298, postMul29);
wt299 = _mm512_mul_ps(wt299, postMul29);
wt300 = _mm512_mul_ps(wt300, postMul29);
wt301 = _mm512_mul_ps(wt301, postMul29);
wt302 = _mm512_mul_ps(wt302, postMul29);
wt303 = _mm512_mul_ps(wt303, postMul29);
wt304 = _mm512_mul_ps(wt304, postMul29);
wt305 = _mm512_mul_ps(wt305, postMul29);
wt306 = _mm512_mul_ps(wt306, postMul29);
wt307 = _mm512_mul_ps(wt307, postMul29);
wt308 = _mm512_mul_ps(wt308, postMul29);
wt309 = _mm512_mul_ps(wt309, postMul29);
wt310 = _mm512_mul_ps(wt310, postMul29);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)0, 63>>cut12, wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)0, 63>>cut12, wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)0, 63>>cut12, wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)0, 63>>cut12, wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)0, 63>>cut12, wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)0, 63>>cut12, wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)0, 63>>cut12, wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)0, 63>>cut12, wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)0, 63>>cut12, wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)0, 63>>cut12, wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)0, 63>>cut12, wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)0, 63>>cut12, wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)0, 63>>cut12, wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)0, 63>>cut12, wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)0, 63>>cut12, wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)0, 63>>cut12, wt310);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt310);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt310);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt310);
}
}
}
} else {
ptrdiff_t k102 = 240;
ptrdiff_t l39 = (size_t)(0+k102)/6;
ptrdiff_t cut11 = (size_t)(0+k102)%6;
__m512 sum223 = _mm512_maskz_loadu_ps(65535, biasPtr9+1024*i34+4*k102);
__m512i pmMul19 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd19 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo16 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k102+256*i34));
__m512 masHi16 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k102+256*i34)+(ptrdiff_t)64);
__m512 postMul27 = _mm512_permutex2var_ps(masLo16, pmMul19, masHi16);
__m512 postAdd17 = _mm512_permutex2var_ps(masLo16, pmAdd19, masHi16);
sum223 = _mm512_fmadd_ps(sum223, postMul27, postAdd17);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*0+(ptrdiff_t)0, 63>>cut11, sum223);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*0+(ptrdiff_t)6144, 4032>>cut11, sum223);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*0+(ptrdiff_t)12288, 65535-(4095>>cut11), sum223);
ptrdiff_t c24 = 0;
for (; c24 != 16; ++c24) {
__m512 wt263 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)0);
__m512 wt264 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)1024);
__m512 wt265 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)2048);
__m512 wt266 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)3072);
__m512 wt267 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)4096);
__m512 wt268 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)5120);
__m512 wt269 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)6144);
__m512 wt270 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)7168);
__m512 wt271 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)8192);
__m512 wt272 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)9216);
__m512 wt273 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)10240);
__m512 wt274 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)11264);
__m512 wt275 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)12288);
__m512 wt276 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)13312);
__m512 wt277 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)14336);
__m512 wt278 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)15360);
__m512 tmp10745 = _mm512_unpacklo_ps(wt263, wt264);
__m512 tmp10746 = _mm512_unpackhi_ps(wt263, wt264);
__m512 tmp10747 = _mm512_unpacklo_ps(wt265, wt266);
__m512 tmp10748 = _mm512_unpackhi_ps(wt265, wt266);
__m512 tmp10749 = _mm512_unpacklo_ps(wt267, wt268);
__m512 tmp10750 = _mm512_unpackhi_ps(wt267, wt268);
__m512 tmp10751 = _mm512_unpacklo_ps(wt269, wt270);
__m512 tmp10752 = _mm512_unpackhi_ps(wt269, wt270);
__m512 tmp10753 = _mm512_unpacklo_ps(wt271, wt272);
__m512 tmp10754 = _mm512_unpackhi_ps(wt271, wt272);
__m512 tmp10755 = _mm512_unpacklo_ps(wt273, wt274);
__m512 tmp10756 = _mm512_unpackhi_ps(wt273, wt274);
__m512 tmp10757 = _mm512_unpacklo_ps(wt275, wt276);
__m512 tmp10758 = _mm512_unpackhi_ps(wt275, wt276);
__m512 tmp10759 = _mm512_unpacklo_ps(wt277, wt278);
__m512 tmp10760 = _mm512_unpackhi_ps(wt277, wt278);
__m512 tmp10761 = _mm512_shuffle_ps(tmp10745, tmp10747, 68);
__m512 tmp10762 = _mm512_shuffle_ps(tmp10745, tmp10747, 238);
__m512 tmp10763 = _mm512_shuffle_ps(tmp10746, tmp10748, 68);
__m512 tmp10764 = _mm512_shuffle_ps(tmp10746, tmp10748, 238);
__m512 tmp10765 = _mm512_shuffle_ps(tmp10749, tmp10751, 68);
__m512 tmp10766 = _mm512_shuffle_ps(tmp10749, tmp10751, 238);
__m512 tmp10767 = _mm512_shuffle_ps(tmp10750, tmp10752, 68);
__m512 tmp10768 = _mm512_shuffle_ps(tmp10750, tmp10752, 238);
__m512 tmp10769 = _mm512_shuffle_ps(tmp10753, tmp10755, 68);
__m512 tmp10770 = _mm512_shuffle_ps(tmp10753, tmp10755, 238);
__m512 tmp10771 = _mm512_shuffle_ps(tmp10754, tmp10756, 68);
__m512 tmp10772 = _mm512_shuffle_ps(tmp10754, tmp10756, 238);
__m512 tmp10773 = _mm512_shuffle_ps(tmp10757, tmp10759, 68);
__m512 tmp10774 = _mm512_shuffle_ps(tmp10757, tmp10759, 238);
__m512 tmp10775 = _mm512_shuffle_ps(tmp10758, tmp10760, 68);
__m512 tmp10776 = _mm512_shuffle_ps(tmp10758, tmp10760, 238);
__m512 tmp10777 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 136);
__m512 tmp10778 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 221);
__m512 tmp10779 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 136);
__m512 tmp10780 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 221);
__m512 tmp10781 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 136);
__m512 tmp10782 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 221);
__m512 tmp10783 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 136);
__m512 tmp10784 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 221);
__m512 tmp10785 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 136);
__m512 tmp10786 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 221);
__m512 tmp10787 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 136);
__m512 tmp10788 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 221);
__m512 tmp10789 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 136);
__m512 tmp10790 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 221);
__m512 tmp10791 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 136);
__m512 tmp10792 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 221);
wt263 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 136);
wt271 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 221);
wt264 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 136);
wt272 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 221);
wt265 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 136);
wt273 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 221);
wt266 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 136);
wt274 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 221);
wt267 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 136);
wt275 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 221);
wt268 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 136);
wt276 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 221);
wt269 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 136);
wt277 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 221);
wt270 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 136);
wt278 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 221);
wt263 = _mm512_mul_ps(wt263, postMul27);
wt264 = _mm512_mul_ps(wt264, postMul27);
wt265 = _mm512_mul_ps(wt265, postMul27);
wt266 = _mm512_mul_ps(wt266, postMul27);
wt267 = _mm512_mul_ps(wt267, postMul27);
wt268 = _mm512_mul_ps(wt268, postMul27);
wt269 = _mm512_mul_ps(wt269, postMul27);
wt270 = _mm512_mul_ps(wt270, postMul27);
wt271 = _mm512_mul_ps(wt271, postMul27);
wt272 = _mm512_mul_ps(wt272, postMul27);
wt273 = _mm512_mul_ps(wt273, postMul27);
wt274 = _mm512_mul_ps(wt274, postMul27);
wt275 = _mm512_mul_ps(wt275, postMul27);
wt276 = _mm512_mul_ps(wt276, postMul27);
wt277 = _mm512_mul_ps(wt277, postMul27);
wt278 = _mm512_mul_ps(wt278, postMul27);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(1+16*c24)+(ptrdiff_t)0, 63>>cut11, wt263);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(2+16*c24)+(ptrdiff_t)0, 63>>cut11, wt264);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(3+16*c24)+(ptrdiff_t)0, 63>>cut11, wt265);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(4+16*c24)+(ptrdiff_t)0, 63>>cut11, wt266);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(5+16*c24)+(ptrdiff_t)0, 63>>cut11, wt267);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(6+16*c24)+(ptrdiff_t)0, 63>>cut11, wt268);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(7+16*c24)+(ptrdiff_t)0, 63>>cut11, wt269);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(8+16*c24)+(ptrdiff_t)0, 63>>cut11, wt270);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(9+16*c24)+(ptrdiff_t)0, 63>>cut11, wt271);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(10+16*c24)+(ptrdiff_t)0, 63>>cut11, wt272);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(11+16*c24)+(ptrdiff_t)0, 63>>cut11, wt273);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(12+16*c24)+(ptrdiff_t)0, 63>>cut11, wt274);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(13+16*c24)+(ptrdiff_t)0, 63>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(14+16*c24)+(ptrdiff_t)0, 63>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(15+16*c24)+(ptrdiff_t)0, 63>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(16+16*c24)+(ptrdiff_t)0, 63>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(1+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt263);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(2+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt264);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(3+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt265);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(4+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt266);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(5+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt267);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(6+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt268);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(7+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt269);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(8+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt270);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(9+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt271);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(10+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt272);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(11+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt273);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(12+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt274);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(13+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(14+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(15+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(16+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(1+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt263);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(2+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt264);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(3+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt265);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(4+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt266);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(5+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt267);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(6+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt268);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(7+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt269);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(8+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt270);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(9+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt271);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(10+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt272);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(11+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt273);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(12+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt274);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(13+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt275);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(14+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt276);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(15+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt277);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(16+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt278);
}
}
}
}
}

/*
 * Launch the step-5 weight-arrangement pass.
 *
 * Builds a 3-dimensional task whose hull is 8 x 1 x 1, with
 * ResNeXt50OneArrangeWts5Callee1 as the work function and `tensors51`
 * passed through unchanged in the task's `any1` slot, then hands the task
 * to ResNeXt50ThreaderDo1 on `team39`.
 *
 * NOTE(review): ResNeXt50ThreaderDo1 is defined elsewhere; presumably it
 * invokes the callee once per point of the hull -- confirm.
 */
static void ResNeXt50OneArrangeWts5(ResNeXt50ThreaderTeam1* team39, char** tensors51) {
    static const int extents[3] = {8, 1, 1};
    ResNeXt50ThreaderTask1 job;

    /* Iteration space first ... */
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }

    /* ... then the work function and its opaque argument. */
    job.any1 = tensors51;
    job.callee1 = ResNeXt50OneArrangeWts5Callee1;

    ResNeXt50ThreaderDo1(team39, &job);
}

/*
 * Work function for the step-5 data-arrangement pass.
 *
 * task56->any1 is an array of byte pointers: element 0 is the source
 * buffer, element 1 is the destination ("arranged") buffer.
 * pt33[0] selects which run of 128 consecutive rows to handle
 * (rows 128*pt33[0] .. 128*pt33[0]+127); pt33[1] selects the column.
 *
 * For every row in that run, 256 bytes (four 16-float vectors) are copied
 * from   source + 256*column + 12608*row
 * to     dest   + 65536*column + 256*row.
 * Nothing is copied when the column equals 49.
 *
 * NOTE(review): presumably "row" is an input channel and "column" a
 * 64-float spatial chunk, so this gathers one chunk of every channel into
 * a contiguous run -- confirm against the code generator.
 */
static void ResNeXt50OneArrangeDats5Callee1(ResNeXt50ThreaderTask1* task56, int64_t* pt33) {
    char** bufs = task56->any1;
    const ptrdiff_t half = pt33[0];
    const ptrdiff_t col = pt33[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const __mmask16 allLanes = 65535;

    if (col != 49) {
        const ptrdiff_t firstRow = 128*half;
        const ptrdiff_t endRow = firstRow+128;
        for (ptrdiff_t row = firstRow; row < endRow; ++row) {
            const char* from = src+256*col+12608*row;
            char* to = dst+65536*col+256*row;
            __m512 lane[4];
            /* Load all four vectors before storing any of them. */
            for (int q = 0; q < 4; ++q) {
                lane[q] = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)64*q);
            }
            for (int q = 0; q < 4; ++q) {
                _mm512_mask_storeu_ps(to+(ptrdiff_t)64*q, allLanes, lane[q]);
            }
        }
    }
}

/*
 * Launch the step-5 data-arrangement pass.
 *
 * Builds a 4-dimensional task whose hull is 2 x 49 x 1 x 1, with
 * ResNeXt50OneArrangeDats5Callee1 as the work function and `tensors53`
 * passed through unchanged in the task's `any1` slot, then hands the task
 * to ResNeXt50ThreaderDo1 on `team40`.
 *
 * The callee reads coordinate 0 as the 128-row half selector and
 * coordinate 1 as the column index.
 *
 * NOTE(review): ResNeXt50ThreaderDo1 is defined elsewhere; presumably it
 * invokes the callee once per point of the hull -- confirm.
 */
static void ResNeXt50OneArrangeDats5(ResNeXt50ThreaderTeam1* team40, char** tensors53) {
    static const int extents[4] = {2, 49, 1, 1};
    ResNeXt50ThreaderTask1 job;

    /* Iteration space first ... */
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }

    /* ... then the work function and its opaque argument. */
    job.any1 = tensors53;
    job.callee1 = ResNeXt50OneArrangeDats5Callee1;

    ResNeXt50ThreaderDo1(team40, &job);
}

// Work function for the step-5 "apply" pass: multiplies arranged weights by
// arranged data, clamps the result at zero, and stores it.
//
// task58->any1 points to an array of pointers; only element 0 is read here,
// and it is an array of byte pointers:
//   [0] arranged weights (read), [1] arranged data (read), [2] output (written).
// pt34[0] (w48) selects which weight blocks to process; pt34[1] (d11) selects
// the 256-byte data column.
//
// Control flow, as written:
//   * The i36 and j29 loops each run their body at most once (every path
//     through the j29 body ends in a return).
//   * For w48 < 20 the k105 loop handles blocks 2*w48 and 2*w48+1, then returns.
//   * For w48 == 20 it handles blocks 40 and 41, leaves the loop with
//     k105 == 42, and the 4-row tail after the loop handles block 42.
//   Each full block yields 6 output rows, the tail yields 4
//   (42*6 + 4 = 256 rows in total across all w48).
//
// NOTE(review): the launcher uses a hull of 21 x 49 x 1, so w48 is presumably
// in 0..20 and d11 in 0..48 -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneApply5Callee1(ResNeXt50ThreaderTask1* task58, int64_t* pt34) {
void** pair14 = task58->any1;
char** tensors56 = pair14[0];
// e16 and g18 are fixed at 0 here, so the offsets scaled by them below vanish.
ptrdiff_t e16 = 0;
ptrdiff_t g18 = 0;
ptrdiff_t d11 = pt34[1];
ptrdiff_t w48 = pt34[0];
char*restrict arrangedWts5 = tensors56[0]+856064*e16+(ptrdiff_t)263168*1*g18;
char*restrict arrangedDats5 = tensors56[1]+10474240*e16+(ptrdiff_t)3211264*1*g18;
char*restrict datPtr17 = tensors56[2]+(ptrdiff_t)3227648*1*g18;
ptrdiff_t ii23 = 1;
for (ptrdiff_t i36 = 0; i36 < ii23; ++i36) {
ptrdiff_t j29 = 1*d11;
// jj35 equals the starting j29, so the check at the bottom of this loop
// always returns after the first pass.
ptrdiff_t jj35 = j29+0;
for (; j29 != 49; ++j29) {
ptrdiff_t k105 = 2*w48;
// Last block index handled inside the loop: one past the first for w48 < 20,
// otherwise two past (which is never reached before k105 hits 42).
ptrdiff_t kk33 = k105+(w48 < 20 ? 1 : 2);
for (; k105 != 42; ++k105) {
// ---- Full block: 6 output rows x 4 vectors of 16 floats ----
// A weight block is 6168 bytes = 257 records of 24 bytes (6 floats each).
// With s24 == -1 the addresses below resolve to record 0 of the block, whose
// six floats seed the accumulators.
// NOTE(review): record 0 is presumably the bias written by the
// weight-arrangement pass -- confirm.
ptrdiff_t s24 = -1;
__m512 sum226 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)24));
__m512 sum230 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)28));
__m512 sum234 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)32));
__m512 sum238 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)36));
__m512 sum242 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)40));
__m512 sum246 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)44));
// Each row's four column accumulators start from the same broadcast value.
__m512 sum227 = sum226;
__m512 sum228 = sum226;
__m512 sum229 = sum226;
__m512 sum231 = sum230;
__m512 sum232 = sum230;
__m512 sum233 = sum230;
__m512 sum235 = sum234;
__m512 sum236 = sum234;
__m512 sum237 = sum234;
__m512 sum239 = sum238;
__m512 sum240 = sum238;
__m512 sum241 = sum238;
__m512 sum243 = sum242;
__m512 sum244 = sum242;
__m512 sum245 = sum242;
__m512 sum247 = sum246;
__m512 sum248 = sum246;
__m512 sum249 = sum246;
// Reduction over 256 terms: per step, load 4 data vectors (256 bytes at
// 65536*j29 + 256*s24, the same strides the arrange-data callee writes with)
// and fuse-multiply-add each against 6 broadcast weights from record s24+1.
for (s24 = 0; s24 < 256; ++s24) {
__m512 dat1681 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s24+(ptrdiff_t)0);
__m512 dat1682 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s24+(ptrdiff_t)64);
__m512 dat1683 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s24+(ptrdiff_t)128);
__m512 dat1684 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s24+(ptrdiff_t)192);
__m512 wt311 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)24));
sum226 = _mm512_fmadd_ps(wt311, dat1681, sum226);
sum227 = _mm512_fmadd_ps(wt311, dat1682, sum227);
sum228 = _mm512_fmadd_ps(wt311, dat1683, sum228);
sum229 = _mm512_fmadd_ps(wt311, dat1684, sum229);
__m512 wt312 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)28));
sum230 = _mm512_fmadd_ps(wt312, dat1681, sum230);
sum231 = _mm512_fmadd_ps(wt312, dat1682, sum231);
sum232 = _mm512_fmadd_ps(wt312, dat1683, sum232);
sum233 = _mm512_fmadd_ps(wt312, dat1684, sum233);
__m512 wt313 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)32));
sum234 = _mm512_fmadd_ps(wt313, dat1681, sum234);
sum235 = _mm512_fmadd_ps(wt313, dat1682, sum235);
sum236 = _mm512_fmadd_ps(wt313, dat1683, sum236);
sum237 = _mm512_fmadd_ps(wt313, dat1684, sum237);
__m512 wt314 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)36));
sum238 = _mm512_fmadd_ps(wt314, dat1681, sum238);
sum239 = _mm512_fmadd_ps(wt314, dat1682, sum239);
sum240 = _mm512_fmadd_ps(wt314, dat1683, sum240);
sum241 = _mm512_fmadd_ps(wt314, dat1684, sum241);
__m512 wt315 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)40));
sum242 = _mm512_fmadd_ps(wt315, dat1681, sum242);
sum243 = _mm512_fmadd_ps(wt315, dat1682, sum243);
sum244 = _mm512_fmadd_ps(wt315, dat1683, sum244);
sum245 = _mm512_fmadd_ps(wt315, dat1684, sum245);
__m512 wt316 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+24*s24+(ptrdiff_t)44));
sum246 = _mm512_fmadd_ps(wt316, dat1681, sum246);
sum247 = _mm512_fmadd_ps(wt316, dat1682, sum247);
sum248 = _mm512_fmadd_ps(wt316, dat1683, sum248);
sum249 = _mm512_fmadd_ps(wt316, dat1684, sum249);
}
// Epilogue: clamp each accumulator at zero (max with 0) and store it.
// Output rows are 12608 bytes apart; a block of 6 rows spans 75648 bytes.
sum226 = _mm512_max_ps(_mm512_setzero_ps(), sum226);
sum227 = _mm512_max_ps(_mm512_setzero_ps(), sum227);
sum228 = _mm512_max_ps(_mm512_setzero_ps(), sum228);
sum229 = _mm512_max_ps(_mm512_setzero_ps(), sum229);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)0, 65535, sum226);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)64, 65535, sum227);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)128, 65535, sum228);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)192, 65535, sum229);
sum230 = _mm512_max_ps(_mm512_setzero_ps(), sum230);
sum231 = _mm512_max_ps(_mm512_setzero_ps(), sum231);
sum232 = _mm512_max_ps(_mm512_setzero_ps(), sum232);
sum233 = _mm512_max_ps(_mm512_setzero_ps(), sum233);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12608, 65535, sum230);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12672, 65535, sum231);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12736, 65535, sum232);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12800, 65535, sum233);
sum234 = _mm512_max_ps(_mm512_setzero_ps(), sum234);
sum235 = _mm512_max_ps(_mm512_setzero_ps(), sum235);
sum236 = _mm512_max_ps(_mm512_setzero_ps(), sum236);
sum237 = _mm512_max_ps(_mm512_setzero_ps(), sum237);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25216, 65535, sum234);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25280, 65535, sum235);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25344, 65535, sum236);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25408, 65535, sum237);
sum238 = _mm512_max_ps(_mm512_setzero_ps(), sum238);
sum239 = _mm512_max_ps(_mm512_setzero_ps(), sum239);
sum240 = _mm512_max_ps(_mm512_setzero_ps(), sum240);
sum241 = _mm512_max_ps(_mm512_setzero_ps(), sum241);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)37824, 65535, sum238);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)37888, 65535, sum239);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)37952, 65535, sum240);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)38016, 65535, sum241);
sum242 = _mm512_max_ps(_mm512_setzero_ps(), sum242);
sum243 = _mm512_max_ps(_mm512_setzero_ps(), sum243);
sum244 = _mm512_max_ps(_mm512_setzero_ps(), sum244);
sum245 = _mm512_max_ps(_mm512_setzero_ps(), sum245);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)50432, 65535, sum242);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)50496, 65535, sum243);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)50560, 65535, sum244);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)50624, 65535, sum245);
sum246 = _mm512_max_ps(_mm512_setzero_ps(), sum246);
sum247 = _mm512_max_ps(_mm512_setzero_ps(), sum247);
sum248 = _mm512_max_ps(_mm512_setzero_ps(), sum248);
sum249 = _mm512_max_ps(_mm512_setzero_ps(), sum249);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)63040, 65535, sum246);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)63104, 65535, sum247);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)63168, 65535, sum248);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)63232, 65535, sum249);
// Done once the last block assigned to this w48 has been written.
if (k105 >= kk33) return;
}
// ---- Tail block: reached only when the loop above ends with k105 == 42 ----
// Same scheme with 4 output rows: weight records are 16 bytes (4 floats),
// and with s25 == -1 the addresses resolve to record 0 of block k105.
ptrdiff_t s25 = -1;
__m512 sum250 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)16));
__m512 sum254 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)20));
__m512 sum258 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)24));
__m512 sum262 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)28));
__m512 sum251 = sum250;
__m512 sum252 = sum250;
__m512 sum253 = sum250;
__m512 sum255 = sum254;
__m512 sum256 = sum254;
__m512 sum257 = sum254;
__m512 sum259 = sum258;
__m512 sum260 = sum258;
__m512 sum261 = sum258;
__m512 sum263 = sum262;
__m512 sum264 = sum262;
__m512 sum265 = sum262;
// 256-term reduction, 4 weights per step.
for (s25 = 0; s25 < 256; ++s25) {
__m512 dat1685 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s25+(ptrdiff_t)0);
__m512 dat1686 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s25+(ptrdiff_t)64);
__m512 dat1687 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s25+(ptrdiff_t)128);
__m512 dat1688 = _mm512_loadu_ps(arrangedDats5+3211264*i36+65536*j29+256*s25+(ptrdiff_t)192);
__m512 wt317 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)16));
sum250 = _mm512_fmadd_ps(wt317, dat1685, sum250);
sum251 = _mm512_fmadd_ps(wt317, dat1686, sum251);
sum252 = _mm512_fmadd_ps(wt317, dat1687, sum252);
sum253 = _mm512_fmadd_ps(wt317, dat1688, sum253);
__m512 wt318 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)20));
sum254 = _mm512_fmadd_ps(wt318, dat1685, sum254);
sum255 = _mm512_fmadd_ps(wt318, dat1686, sum255);
sum256 = _mm512_fmadd_ps(wt318, dat1687, sum256);
sum257 = _mm512_fmadd_ps(wt318, dat1688, sum257);
__m512 wt319 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)24));
sum258 = _mm512_fmadd_ps(wt319, dat1685, sum258);
sum259 = _mm512_fmadd_ps(wt319, dat1686, sum259);
sum260 = _mm512_fmadd_ps(wt319, dat1687, sum260);
sum261 = _mm512_fmadd_ps(wt319, dat1688, sum261);
__m512 wt320 = _mm512_set1_ps(*(float*)(arrangedWts5+263168*i36+6168*k105+16*s25+(ptrdiff_t)28));
sum262 = _mm512_fmadd_ps(wt320, dat1685, sum262);
sum263 = _mm512_fmadd_ps(wt320, dat1686, sum263);
sum264 = _mm512_fmadd_ps(wt320, dat1687, sum264);
sum265 = _mm512_fmadd_ps(wt320, dat1688, sum265);
}
// Clamp at zero and store the 4 tail rows (same 12608-byte row stride).
sum250 = _mm512_max_ps(_mm512_setzero_ps(), sum250);
sum251 = _mm512_max_ps(_mm512_setzero_ps(), sum251);
sum252 = _mm512_max_ps(_mm512_setzero_ps(), sum252);
sum253 = _mm512_max_ps(_mm512_setzero_ps(), sum253);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)0, 65535, sum250);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)64, 65535, sum251);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)128, 65535, sum252);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)192, 65535, sum253);
sum254 = _mm512_max_ps(_mm512_setzero_ps(), sum254);
sum255 = _mm512_max_ps(_mm512_setzero_ps(), sum255);
sum256 = _mm512_max_ps(_mm512_setzero_ps(), sum256);
sum257 = _mm512_max_ps(_mm512_setzero_ps(), sum257);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12608, 65535, sum254);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12672, 65535, sum255);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12736, 65535, sum256);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)12800, 65535, sum257);
sum258 = _mm512_max_ps(_mm512_setzero_ps(), sum258);
sum259 = _mm512_max_ps(_mm512_setzero_ps(), sum259);
sum260 = _mm512_max_ps(_mm512_setzero_ps(), sum260);
sum261 = _mm512_max_ps(_mm512_setzero_ps(), sum261);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25216, 65535, sum258);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25280, 65535, sum259);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25344, 65535, sum260);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)25408, 65535, sum261);
sum262 = _mm512_max_ps(_mm512_setzero_ps(), sum262);
sum263 = _mm512_max_ps(_mm512_setzero_ps(), sum263);
sum264 = _mm512_max_ps(_mm512_setzero_ps(), sum264);
sum265 = _mm512_max_ps(_mm512_setzero_ps(), sum265);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)37824, 65535, sum262);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)37888, 65535, sum263);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)37952, 65535, sum264);
_mm512_mask_storeu_ps(datPtr17+3227648*i36+256*j29+75648*k105+(ptrdiff_t)38016, 65535, sum265);
// Always true on the first pass (jj35 is the starting j29): one column per call.
if (j29 >= jj35) return;
}
}
}

/*
 * Launch the step-5 "apply" pass.
 *
 * Wraps `tensors55` in a two-slot pointer array (second slot null), builds
 * a 3-dimensional task whose hull is 21 x 49 x 1 with
 * ResNeXt50OneApply5Callee1 as the work function and that array in the
 * task's `any1` slot, then hands the task to ResNeXt50ThreaderDo1 on
 * `team41`.
 *
 * The callee reads coordinate 0 as the weight-block selector and
 * coordinate 1 as the data-column index; it only dereferences slot 0 of
 * the pointer array.
 *
 * NOTE(review): ResNeXt50ThreaderDo1 is defined elsewhere; presumably it
 * invokes the callee once per point of the hull and returns only after
 * all of them finish (the pointer array lives on this stack frame) --
 * confirm.
 */
static void ResNeXt50OneApply5(ResNeXt50ThreaderTeam1* team41, char** tensors55) {
    static const int extents[3] = {21, 49, 1};
    void* args[2];
    ResNeXt50ThreaderTask1 job;

    args[0] = tensors55;
    args[1] = 0;

    /* Iteration space first ... */
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }

    /* ... then the work function and its opaque argument. */
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply5Callee1;

    ResNeXt50ThreaderDo1(team41, &job);
}

static void ResNeXt50OneArrangeWts6Callee1(ResNeXt50ThreaderTask1* task68, int64_t* pt39) {
char** tensors66 = task68->any1;
ptrdiff_t b69 = pt39[0];
char*restrict wtPtr11 = tensors66[0]+(ptrdiff_t)3340*0+(ptrdiff_t)524288*0;
char*restrict biasPtr11 = tensors66[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr12 = tensors66[2]+(ptrdiff_t)8*512*0;
char*restrict arranged11 = tensors66[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)526336*0;
ptrdiff_t ii28 = 1;
for (ptrdiff_t i42 = 0; i42 < ii28; ++i42) {
ptrdiff_t j35 = 2*b69;
ptrdiff_t jj38 = j35+2;
for (; j35 < jj38; ++j35) {
if (j35 < 31) {
ptrdiff_t k118 = 0+16*(j35-0);
ptrdiff_t l50 = (size_t)(0+k118)/6;
ptrdiff_t cut14 = (size_t)(0+k118)%6;
switch (cut14) {
case 0:;
case 2: {
__m512 sum267 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i42+4*k118);
__m512i pmMul21 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd21 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo17 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k118+512*i42));
__m512 masHi17 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k118+512*i42)+(ptrdiff_t)64);
__m512 postMul34 = _mm512_permutex2var_ps(masLo17, pmMul21, masHi17);
__m512 postAdd22 = _mm512_permutex2var_ps(masLo17, pmAdd21, masHi17);
sum267 = _mm512_fmadd_ps(sum267, postMul34, postAdd22);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum267);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)6144, 4032>>cut14, sum267);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)12288, 65535-(4095>>cut14), sum267);
ptrdiff_t c32 = 0;
for (; c32 != 16; ++c32) {
__m512 wt343 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)0);
__m512 wt344 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)1024);
__m512 wt345 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)2048);
__m512 wt346 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)3072);
__m512 wt347 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)4096);
__m512 wt348 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)5120);
__m512 wt349 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)6144);
__m512 wt350 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)7168);
__m512 wt351 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)8192);
__m512 wt352 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)9216);
__m512 wt353 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)10240);
__m512 wt354 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)11264);
__m512 wt355 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)12288);
__m512 wt356 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)13312);
__m512 wt357 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)14336);
__m512 wt358 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)15360);
__m512 tmp10793 = _mm512_unpacklo_ps(wt343, wt344);
__m512 tmp10794 = _mm512_unpackhi_ps(wt343, wt344);
__m512 tmp10795 = _mm512_unpacklo_ps(wt345, wt346);
__m512 tmp10796 = _mm512_unpackhi_ps(wt345, wt346);
__m512 tmp10797 = _mm512_unpacklo_ps(wt347, wt348);
__m512 tmp10798 = _mm512_unpackhi_ps(wt347, wt348);
__m512 tmp10799 = _mm512_unpacklo_ps(wt349, wt350);
__m512 tmp10800 = _mm512_unpackhi_ps(wt349, wt350);
__m512 tmp10801 = _mm512_unpacklo_ps(wt351, wt352);
__m512 tmp10802 = _mm512_unpackhi_ps(wt351, wt352);
__m512 tmp10803 = _mm512_unpacklo_ps(wt353, wt354);
__m512 tmp10804 = _mm512_unpackhi_ps(wt353, wt354);
__m512 tmp10805 = _mm512_unpacklo_ps(wt355, wt356);
__m512 tmp10806 = _mm512_unpackhi_ps(wt355, wt356);
__m512 tmp10807 = _mm512_unpacklo_ps(wt357, wt358);
__m512 tmp10808 = _mm512_unpackhi_ps(wt357, wt358);
__m512 tmp10809 = _mm512_shuffle_ps(tmp10793, tmp10795, 68);
__m512 tmp10810 = _mm512_shuffle_ps(tmp10793, tmp10795, 238);
__m512 tmp10811 = _mm512_shuffle_ps(tmp10794, tmp10796, 68);
__m512 tmp10812 = _mm512_shuffle_ps(tmp10794, tmp10796, 238);
__m512 tmp10813 = _mm512_shuffle_ps(tmp10797, tmp10799, 68);
__m512 tmp10814 = _mm512_shuffle_ps(tmp10797, tmp10799, 238);
__m512 tmp10815 = _mm512_shuffle_ps(tmp10798, tmp10800, 68);
__m512 tmp10816 = _mm512_shuffle_ps(tmp10798, tmp10800, 238);
__m512 tmp10817 = _mm512_shuffle_ps(tmp10801, tmp10803, 68);
__m512 tmp10818 = _mm512_shuffle_ps(tmp10801, tmp10803, 238);
__m512 tmp10819 = _mm512_shuffle_ps(tmp10802, tmp10804, 68);
__m512 tmp10820 = _mm512_shuffle_ps(tmp10802, tmp10804, 238);
__m512 tmp10821 = _mm512_shuffle_ps(tmp10805, tmp10807, 68);
__m512 tmp10822 = _mm512_shuffle_ps(tmp10805, tmp10807, 238);
__m512 tmp10823 = _mm512_shuffle_ps(tmp10806, tmp10808, 68);
__m512 tmp10824 = _mm512_shuffle_ps(tmp10806, tmp10808, 238);
__m512 tmp10825 = _mm512_shuffle_f32x4(tmp10809, tmp10813, 136);
__m512 tmp10826 = _mm512_shuffle_f32x4(tmp10809, tmp10813, 221);
__m512 tmp10827 = _mm512_shuffle_f32x4(tmp10810, tmp10814, 136);
__m512 tmp10828 = _mm512_shuffle_f32x4(tmp10810, tmp10814, 221);
__m512 tmp10829 = _mm512_shuffle_f32x4(tmp10811, tmp10815, 136);
__m512 tmp10830 = _mm512_shuffle_f32x4(tmp10811, tmp10815, 221);
__m512 tmp10831 = _mm512_shuffle_f32x4(tmp10812, tmp10816, 136);
__m512 tmp10832 = _mm512_shuffle_f32x4(tmp10812, tmp10816, 221);
__m512 tmp10833 = _mm512_shuffle_f32x4(tmp10817, tmp10821, 136);
__m512 tmp10834 = _mm512_shuffle_f32x4(tmp10817, tmp10821, 221);
__m512 tmp10835 = _mm512_shuffle_f32x4(tmp10818, tmp10822, 136);
__m512 tmp10836 = _mm512_shuffle_f32x4(tmp10818, tmp10822, 221);
__m512 tmp10837 = _mm512_shuffle_f32x4(tmp10819, tmp10823, 136);
__m512 tmp10838 = _mm512_shuffle_f32x4(tmp10819, tmp10823, 221);
__m512 tmp10839 = _mm512_shuffle_f32x4(tmp10820, tmp10824, 136);
__m512 tmp10840 = _mm512_shuffle_f32x4(tmp10820, tmp10824, 221);
wt343 = _mm512_shuffle_f32x4(tmp10825, tmp10833, 136);
wt351 = _mm512_shuffle_f32x4(tmp10825, tmp10833, 221);
wt344 = _mm512_shuffle_f32x4(tmp10827, tmp10835, 136);
wt352 = _mm512_shuffle_f32x4(tmp10827, tmp10835, 221);
wt345 = _mm512_shuffle_f32x4(tmp10829, tmp10837, 136);
wt353 = _mm512_shuffle_f32x4(tmp10829, tmp10837, 221);
wt346 = _mm512_shuffle_f32x4(tmp10831, tmp10839, 136);
wt354 = _mm512_shuffle_f32x4(tmp10831, tmp10839, 221);
wt347 = _mm512_shuffle_f32x4(tmp10826, tmp10834, 136);
wt355 = _mm512_shuffle_f32x4(tmp10826, tmp10834, 221);
wt348 = _mm512_shuffle_f32x4(tmp10828, tmp10836, 136);
wt356 = _mm512_shuffle_f32x4(tmp10828, tmp10836, 221);
wt349 = _mm512_shuffle_f32x4(tmp10830, tmp10838, 136);
wt357 = _mm512_shuffle_f32x4(tmp10830, tmp10838, 221);
wt350 = _mm512_shuffle_f32x4(tmp10832, tmp10840, 136);
wt358 = _mm512_shuffle_f32x4(tmp10832, tmp10840, 221);
wt343 = _mm512_mul_ps(wt343, postMul34);
wt344 = _mm512_mul_ps(wt344, postMul34);
wt345 = _mm512_mul_ps(wt345, postMul34);
wt346 = _mm512_mul_ps(wt346, postMul34);
wt347 = _mm512_mul_ps(wt347, postMul34);
wt348 = _mm512_mul_ps(wt348, postMul34);
wt349 = _mm512_mul_ps(wt349, postMul34);
wt350 = _mm512_mul_ps(wt350, postMul34);
wt351 = _mm512_mul_ps(wt351, postMul34);
wt352 = _mm512_mul_ps(wt352, postMul34);
wt353 = _mm512_mul_ps(wt353, postMul34);
wt354 = _mm512_mul_ps(wt354, postMul34);
wt355 = _mm512_mul_ps(wt355, postMul34);
wt356 = _mm512_mul_ps(wt356, postMul34);
wt357 = _mm512_mul_ps(wt357, postMul34);
wt358 = _mm512_mul_ps(wt358, postMul34);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c32)+(ptrdiff_t)0, 63>>cut14, wt343);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c32)+(ptrdiff_t)0, 63>>cut14, wt344);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c32)+(ptrdiff_t)0, 63>>cut14, wt345);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c32)+(ptrdiff_t)0, 63>>cut14, wt346);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c32)+(ptrdiff_t)0, 63>>cut14, wt347);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c32)+(ptrdiff_t)0, 63>>cut14, wt348);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c32)+(ptrdiff_t)0, 63>>cut14, wt349);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c32)+(ptrdiff_t)0, 63>>cut14, wt350);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c32)+(ptrdiff_t)0, 63>>cut14, wt351);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c32)+(ptrdiff_t)0, 63>>cut14, wt352);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c32)+(ptrdiff_t)0, 63>>cut14, wt353);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c32)+(ptrdiff_t)0, 63>>cut14, wt354);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c32)+(ptrdiff_t)0, 63>>cut14, wt355);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c32)+(ptrdiff_t)0, 63>>cut14, wt356);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c32)+(ptrdiff_t)0, 63>>cut14, wt357);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c32)+(ptrdiff_t)0, 63>>cut14, wt358);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt343);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt344);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt345);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt346);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt347);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt348);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt349);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt350);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt351);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt352);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt353);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt354);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt355);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt356);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt357);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt358);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt343);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt344);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt345);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt346);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt347);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt348);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt349);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt350);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt351);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt352);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt353);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt354);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt355);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt356);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt357);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt358);
}
break;
}
default: {
cut14 = 4;
__m512 sum268 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i42+4*k118);
__m512i pmMul22 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd22 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo18 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k118+512*i42));
__m512 masHi18 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k118+512*i42)+(ptrdiff_t)64);
__m512 postMul35 = _mm512_permutex2var_ps(masLo18, pmMul22, masHi18);
__m512 postAdd23 = _mm512_permutex2var_ps(masLo18, pmAdd22, masHi18);
sum268 = _mm512_fmadd_ps(sum268, postMul35, postAdd23);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum268);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)6144, 4032>>cut14, sum268);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)12288, 258048>>cut14, sum268);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)18432, 65535-(262143>>cut14), sum268);
ptrdiff_t c33 = 0;
for (; c33 != 16; ++c33) {
__m512 wt359 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)0);
__m512 wt360 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)1024);
__m512 wt361 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)2048);
__m512 wt362 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)3072);
__m512 wt363 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)4096);
__m512 wt364 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)5120);
__m512 wt365 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)6144);
__m512 wt366 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)7168);
__m512 wt367 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)8192);
__m512 wt368 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)9216);
__m512 wt369 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)10240);
__m512 wt370 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)11264);
__m512 wt371 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)12288);
__m512 wt372 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)13312);
__m512 wt373 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)14336);
__m512 wt374 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)15360);
__m512 tmp10841 = _mm512_unpacklo_ps(wt359, wt360);
__m512 tmp10842 = _mm512_unpackhi_ps(wt359, wt360);
__m512 tmp10843 = _mm512_unpacklo_ps(wt361, wt362);
__m512 tmp10844 = _mm512_unpackhi_ps(wt361, wt362);
__m512 tmp10845 = _mm512_unpacklo_ps(wt363, wt364);
__m512 tmp10846 = _mm512_unpackhi_ps(wt363, wt364);
__m512 tmp10847 = _mm512_unpacklo_ps(wt365, wt366);
__m512 tmp10848 = _mm512_unpackhi_ps(wt365, wt366);
__m512 tmp10849 = _mm512_unpacklo_ps(wt367, wt368);
__m512 tmp10850 = _mm512_unpackhi_ps(wt367, wt368);
__m512 tmp10851 = _mm512_unpacklo_ps(wt369, wt370);
__m512 tmp10852 = _mm512_unpackhi_ps(wt369, wt370);
__m512 tmp10853 = _mm512_unpacklo_ps(wt371, wt372);
__m512 tmp10854 = _mm512_unpackhi_ps(wt371, wt372);
__m512 tmp10855 = _mm512_unpacklo_ps(wt373, wt374);
__m512 tmp10856 = _mm512_unpackhi_ps(wt373, wt374);
__m512 tmp10857 = _mm512_shuffle_ps(tmp10841, tmp10843, 68);
__m512 tmp10858 = _mm512_shuffle_ps(tmp10841, tmp10843, 238);
__m512 tmp10859 = _mm512_shuffle_ps(tmp10842, tmp10844, 68);
__m512 tmp10860 = _mm512_shuffle_ps(tmp10842, tmp10844, 238);
__m512 tmp10861 = _mm512_shuffle_ps(tmp10845, tmp10847, 68);
__m512 tmp10862 = _mm512_shuffle_ps(tmp10845, tmp10847, 238);
__m512 tmp10863 = _mm512_shuffle_ps(tmp10846, tmp10848, 68);
__m512 tmp10864 = _mm512_shuffle_ps(tmp10846, tmp10848, 238);
__m512 tmp10865 = _mm512_shuffle_ps(tmp10849, tmp10851, 68);
__m512 tmp10866 = _mm512_shuffle_ps(tmp10849, tmp10851, 238);
__m512 tmp10867 = _mm512_shuffle_ps(tmp10850, tmp10852, 68);
__m512 tmp10868 = _mm512_shuffle_ps(tmp10850, tmp10852, 238);
__m512 tmp10869 = _mm512_shuffle_ps(tmp10853, tmp10855, 68);
__m512 tmp10870 = _mm512_shuffle_ps(tmp10853, tmp10855, 238);
__m512 tmp10871 = _mm512_shuffle_ps(tmp10854, tmp10856, 68);
__m512 tmp10872 = _mm512_shuffle_ps(tmp10854, tmp10856, 238);
__m512 tmp10873 = _mm512_shuffle_f32x4(tmp10857, tmp10861, 136);
__m512 tmp10874 = _mm512_shuffle_f32x4(tmp10857, tmp10861, 221);
__m512 tmp10875 = _mm512_shuffle_f32x4(tmp10858, tmp10862, 136);
__m512 tmp10876 = _mm512_shuffle_f32x4(tmp10858, tmp10862, 221);
__m512 tmp10877 = _mm512_shuffle_f32x4(tmp10859, tmp10863, 136);
__m512 tmp10878 = _mm512_shuffle_f32x4(tmp10859, tmp10863, 221);
__m512 tmp10879 = _mm512_shuffle_f32x4(tmp10860, tmp10864, 136);
__m512 tmp10880 = _mm512_shuffle_f32x4(tmp10860, tmp10864, 221);
__m512 tmp10881 = _mm512_shuffle_f32x4(tmp10865, tmp10869, 136);
__m512 tmp10882 = _mm512_shuffle_f32x4(tmp10865, tmp10869, 221);
__m512 tmp10883 = _mm512_shuffle_f32x4(tmp10866, tmp10870, 136);
__m512 tmp10884 = _mm512_shuffle_f32x4(tmp10866, tmp10870, 221);
__m512 tmp10885 = _mm512_shuffle_f32x4(tmp10867, tmp10871, 136);
__m512 tmp10886 = _mm512_shuffle_f32x4(tmp10867, tmp10871, 221);
__m512 tmp10887 = _mm512_shuffle_f32x4(tmp10868, tmp10872, 136);
__m512 tmp10888 = _mm512_shuffle_f32x4(tmp10868, tmp10872, 221);
wt359 = _mm512_shuffle_f32x4(tmp10873, tmp10881, 136);
wt367 = _mm512_shuffle_f32x4(tmp10873, tmp10881, 221);
wt360 = _mm512_shuffle_f32x4(tmp10875, tmp10883, 136);
wt368 = _mm512_shuffle_f32x4(tmp10875, tmp10883, 221);
wt361 = _mm512_shuffle_f32x4(tmp10877, tmp10885, 136);
wt369 = _mm512_shuffle_f32x4(tmp10877, tmp10885, 221);
wt362 = _mm512_shuffle_f32x4(tmp10879, tmp10887, 136);
wt370 = _mm512_shuffle_f32x4(tmp10879, tmp10887, 221);
wt363 = _mm512_shuffle_f32x4(tmp10874, tmp10882, 136);
wt371 = _mm512_shuffle_f32x4(tmp10874, tmp10882, 221);
wt364 = _mm512_shuffle_f32x4(tmp10876, tmp10884, 136);
wt372 = _mm512_shuffle_f32x4(tmp10876, tmp10884, 221);
wt365 = _mm512_shuffle_f32x4(tmp10878, tmp10886, 136);
wt373 = _mm512_shuffle_f32x4(tmp10878, tmp10886, 221);
wt366 = _mm512_shuffle_f32x4(tmp10880, tmp10888, 136);
wt374 = _mm512_shuffle_f32x4(tmp10880, tmp10888, 221);
wt359 = _mm512_mul_ps(wt359, postMul35);
wt360 = _mm512_mul_ps(wt360, postMul35);
wt361 = _mm512_mul_ps(wt361, postMul35);
wt362 = _mm512_mul_ps(wt362, postMul35);
wt363 = _mm512_mul_ps(wt363, postMul35);
wt364 = _mm512_mul_ps(wt364, postMul35);
wt365 = _mm512_mul_ps(wt365, postMul35);
wt366 = _mm512_mul_ps(wt366, postMul35);
wt367 = _mm512_mul_ps(wt367, postMul35);
wt368 = _mm512_mul_ps(wt368, postMul35);
wt369 = _mm512_mul_ps(wt369, postMul35);
wt370 = _mm512_mul_ps(wt370, postMul35);
wt371 = _mm512_mul_ps(wt371, postMul35);
wt372 = _mm512_mul_ps(wt372, postMul35);
wt373 = _mm512_mul_ps(wt373, postMul35);
wt374 = _mm512_mul_ps(wt374, postMul35);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)0, 63>>cut14, wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)0, 63>>cut14, wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)0, 63>>cut14, wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)0, 63>>cut14, wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)0, 63>>cut14, wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)0, 63>>cut14, wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)0, 63>>cut14, wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)0, 63>>cut14, wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)0, 63>>cut14, wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)0, 63>>cut14, wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)0, 63>>cut14, wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)0, 63>>cut14, wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)0, 63>>cut14, wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)0, 63>>cut14, wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)0, 63>>cut14, wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)0, 63>>cut14, wt374);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt374);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt374);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt374);
}
}
}
} else {
ptrdiff_t k117 = 496;
ptrdiff_t l49 = (size_t)(0+k117)/6;
ptrdiff_t cut13 = (size_t)(0+k117)%6;
__m512 sum266 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i42+4*k117);
__m512i pmMul23 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd23 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo19 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k117+512*i42));
__m512 masHi19 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k117+512*i42)+(ptrdiff_t)64);
__m512 postMul33 = _mm512_permutex2var_ps(masLo19, pmMul23, masHi19);
__m512 postAdd21 = _mm512_permutex2var_ps(masLo19, pmAdd23, masHi19);
sum266 = _mm512_fmadd_ps(sum266, postMul33, postAdd21);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*0+(ptrdiff_t)0, 63>>cut13, sum266);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*0+(ptrdiff_t)6144, 4032>>cut13, sum266);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*0+(ptrdiff_t)12288, 258048>>cut13, sum266);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*0+(ptrdiff_t)18432, 65535-(262143>>cut13), sum266);
ptrdiff_t c31 = 0;
for (; c31 != 16; ++c31) {
__m512 wt327 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)0);
__m512 wt328 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)1024);
__m512 wt329 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)2048);
__m512 wt330 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)3072);
__m512 wt331 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)4096);
__m512 wt332 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)5120);
__m512 wt333 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)6144);
__m512 wt334 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)7168);
__m512 wt335 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)8192);
__m512 wt336 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)9216);
__m512 wt337 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)10240);
__m512 wt338 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)11264);
__m512 wt339 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)12288);
__m512 wt340 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)13312);
__m512 wt341 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)14336);
__m512 wt342 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)15360);
__m512 tmp10889 = _mm512_unpacklo_ps(wt327, wt328);
__m512 tmp10890 = _mm512_unpackhi_ps(wt327, wt328);
__m512 tmp10891 = _mm512_unpacklo_ps(wt329, wt330);
__m512 tmp10892 = _mm512_unpackhi_ps(wt329, wt330);
__m512 tmp10893 = _mm512_unpacklo_ps(wt331, wt332);
__m512 tmp10894 = _mm512_unpackhi_ps(wt331, wt332);
__m512 tmp10895 = _mm512_unpacklo_ps(wt333, wt334);
__m512 tmp10896 = _mm512_unpackhi_ps(wt333, wt334);
__m512 tmp10897 = _mm512_unpacklo_ps(wt335, wt336);
__m512 tmp10898 = _mm512_unpackhi_ps(wt335, wt336);
__m512 tmp10899 = _mm512_unpacklo_ps(wt337, wt338);
__m512 tmp10900 = _mm512_unpackhi_ps(wt337, wt338);
__m512 tmp10901 = _mm512_unpacklo_ps(wt339, wt340);
__m512 tmp10902 = _mm512_unpackhi_ps(wt339, wt340);
__m512 tmp10903 = _mm512_unpacklo_ps(wt341, wt342);
__m512 tmp10904 = _mm512_unpackhi_ps(wt341, wt342);
__m512 tmp10905 = _mm512_shuffle_ps(tmp10889, tmp10891, 68);
__m512 tmp10906 = _mm512_shuffle_ps(tmp10889, tmp10891, 238);
__m512 tmp10907 = _mm512_shuffle_ps(tmp10890, tmp10892, 68);
__m512 tmp10908 = _mm512_shuffle_ps(tmp10890, tmp10892, 238);
__m512 tmp10909 = _mm512_shuffle_ps(tmp10893, tmp10895, 68);
__m512 tmp10910 = _mm512_shuffle_ps(tmp10893, tmp10895, 238);
__m512 tmp10911 = _mm512_shuffle_ps(tmp10894, tmp10896, 68);
__m512 tmp10912 = _mm512_shuffle_ps(tmp10894, tmp10896, 238);
__m512 tmp10913 = _mm512_shuffle_ps(tmp10897, tmp10899, 68);
__m512 tmp10914 = _mm512_shuffle_ps(tmp10897, tmp10899, 238);
__m512 tmp10915 = _mm512_shuffle_ps(tmp10898, tmp10900, 68);
__m512 tmp10916 = _mm512_shuffle_ps(tmp10898, tmp10900, 238);
__m512 tmp10917 = _mm512_shuffle_ps(tmp10901, tmp10903, 68);
__m512 tmp10918 = _mm512_shuffle_ps(tmp10901, tmp10903, 238);
__m512 tmp10919 = _mm512_shuffle_ps(tmp10902, tmp10904, 68);
__m512 tmp10920 = _mm512_shuffle_ps(tmp10902, tmp10904, 238);
__m512 tmp10921 = _mm512_shuffle_f32x4(tmp10905, tmp10909, 136);
__m512 tmp10922 = _mm512_shuffle_f32x4(tmp10905, tmp10909, 221);
__m512 tmp10923 = _mm512_shuffle_f32x4(tmp10906, tmp10910, 136);
__m512 tmp10924 = _mm512_shuffle_f32x4(tmp10906, tmp10910, 221);
__m512 tmp10925 = _mm512_shuffle_f32x4(tmp10907, tmp10911, 136);
__m512 tmp10926 = _mm512_shuffle_f32x4(tmp10907, tmp10911, 221);
__m512 tmp10927 = _mm512_shuffle_f32x4(tmp10908, tmp10912, 136);
__m512 tmp10928 = _mm512_shuffle_f32x4(tmp10908, tmp10912, 221);
__m512 tmp10929 = _mm512_shuffle_f32x4(tmp10913, tmp10917, 136);
__m512 tmp10930 = _mm512_shuffle_f32x4(tmp10913, tmp10917, 221);
__m512 tmp10931 = _mm512_shuffle_f32x4(tmp10914, tmp10918, 136);
__m512 tmp10932 = _mm512_shuffle_f32x4(tmp10914, tmp10918, 221);
__m512 tmp10933 = _mm512_shuffle_f32x4(tmp10915, tmp10919, 136);
__m512 tmp10934 = _mm512_shuffle_f32x4(tmp10915, tmp10919, 221);
__m512 tmp10935 = _mm512_shuffle_f32x4(tmp10916, tmp10920, 136);
__m512 tmp10936 = _mm512_shuffle_f32x4(tmp10916, tmp10920, 221);
wt327 = _mm512_shuffle_f32x4(tmp10921, tmp10929, 136);
wt335 = _mm512_shuffle_f32x4(tmp10921, tmp10929, 221);
wt328 = _mm512_shuffle_f32x4(tmp10923, tmp10931, 136);
wt336 = _mm512_shuffle_f32x4(tmp10923, tmp10931, 221);
wt329 = _mm512_shuffle_f32x4(tmp10925, tmp10933, 136);
wt337 = _mm512_shuffle_f32x4(tmp10925, tmp10933, 221);
wt330 = _mm512_shuffle_f32x4(tmp10927, tmp10935, 136);
wt338 = _mm512_shuffle_f32x4(tmp10927, tmp10935, 221);
wt331 = _mm512_shuffle_f32x4(tmp10922, tmp10930, 136);
wt339 = _mm512_shuffle_f32x4(tmp10922, tmp10930, 221);
wt332 = _mm512_shuffle_f32x4(tmp10924, tmp10932, 136);
wt340 = _mm512_shuffle_f32x4(tmp10924, tmp10932, 221);
wt333 = _mm512_shuffle_f32x4(tmp10926, tmp10934, 136);
wt341 = _mm512_shuffle_f32x4(tmp10926, tmp10934, 221);
wt334 = _mm512_shuffle_f32x4(tmp10928, tmp10936, 136);
wt342 = _mm512_shuffle_f32x4(tmp10928, tmp10936, 221);
wt327 = _mm512_mul_ps(wt327, postMul33);
wt328 = _mm512_mul_ps(wt328, postMul33);
wt329 = _mm512_mul_ps(wt329, postMul33);
wt330 = _mm512_mul_ps(wt330, postMul33);
wt331 = _mm512_mul_ps(wt331, postMul33);
wt332 = _mm512_mul_ps(wt332, postMul33);
wt333 = _mm512_mul_ps(wt333, postMul33);
wt334 = _mm512_mul_ps(wt334, postMul33);
wt335 = _mm512_mul_ps(wt335, postMul33);
wt336 = _mm512_mul_ps(wt336, postMul33);
wt337 = _mm512_mul_ps(wt337, postMul33);
wt338 = _mm512_mul_ps(wt338, postMul33);
wt339 = _mm512_mul_ps(wt339, postMul33);
wt340 = _mm512_mul_ps(wt340, postMul33);
wt341 = _mm512_mul_ps(wt341, postMul33);
wt342 = _mm512_mul_ps(wt342, postMul33);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(1+16*c31)+(ptrdiff_t)0, 63>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(2+16*c31)+(ptrdiff_t)0, 63>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(3+16*c31)+(ptrdiff_t)0, 63>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(4+16*c31)+(ptrdiff_t)0, 63>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(5+16*c31)+(ptrdiff_t)0, 63>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(6+16*c31)+(ptrdiff_t)0, 63>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(7+16*c31)+(ptrdiff_t)0, 63>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(8+16*c31)+(ptrdiff_t)0, 63>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(9+16*c31)+(ptrdiff_t)0, 63>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(10+16*c31)+(ptrdiff_t)0, 63>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(11+16*c31)+(ptrdiff_t)0, 63>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(12+16*c31)+(ptrdiff_t)0, 63>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(13+16*c31)+(ptrdiff_t)0, 63>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(14+16*c31)+(ptrdiff_t)0, 63>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(15+16*c31)+(ptrdiff_t)0, 63>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(16+16*c31)+(ptrdiff_t)0, 63>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(1+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(2+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(3+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(4+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(5+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(6+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(7+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(8+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(9+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(10+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(11+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(12+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(13+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(14+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(15+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(16+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(1+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(2+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(3+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(4+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(5+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(6+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(7+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(8+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(9+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(10+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(11+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(12+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(13+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(14+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(15+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(16+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(1+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(2+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(3+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(4+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(5+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(6+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(7+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(8+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(9+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(10+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(11+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(12+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(13+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(14+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(15+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(16+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt342);
}
}
}
}
}

// Builds a 3-dimensional task (extents 16 x 1 x 1) whose worker is
// ResNeXt50OneArrangeWts6Callee1 and hands it to ResNeXt50ThreaderDo1 on
// the given team. The tensor pointer table is passed through unchanged
// as the task's opaque payload.
static void ResNeXt50OneArrangeWts6(ResNeXt50ThreaderTeam1* team46, char** tensors65) {
    ResNeXt50ThreaderTask1 job = {
        .callee1 = ResNeXt50OneArrangeWts6Callee1,
        .any1 = tensors65,
        .nd1 = 3,
        .hull1 = {16, 1, 1},
    };
    ResNeXt50ThreaderDo1(team46, &job);
}

// Worker for one point of the data-arrangement task grid.
//
// task70->any1 is a table of byte pointers: entry 0 is the source buffer,
// entry 1 is the destination ("arranged") buffer.
// pt40[0] picks a run of 128 consecutive source rows (rows are 3136 bytes
// apart); pt40[1] picks a 256-byte column inside every row.
//
// For any column other than 12, four 64-byte vectors per row are copied and
// destination rows sit 256 bytes apart inside a 65536-byte column slab.
// Column 12 is the narrow one: a single 64-byte vector per row is copied and
// destination rows sit 64 bytes apart.
static void ResNeXt50OneArrangeDats6Callee1(ResNeXt50ThreaderTask1* task70, int64_t* pt40) {
    char** bufs = task70->any1;
    const ptrdiff_t rowGroup = pt40[0];
    const ptrdiff_t col = pt40[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const ptrdiff_t rowBegin = 128*rowGroup;
    const ptrdiff_t rowEnd = rowBegin+128;
    const __mmask16 every = 65535;

    if (col == 12) {
        // Narrow trailing column: one vector per row, tightly packed output.
        for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
            __m512 v = _mm512_maskz_loadu_ps(every, src+256*col+3136*row);
            _mm512_mask_storeu_ps(dst+65536*col+64*row, every, v);
        }
        return;
    }

    // Full-width column: four vectors per row, all loads issued before stores.
    for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
        const char* in = src+256*col+3136*row;
        char* out = dst+65536*col+256*row;
        __m512 v0 = _mm512_maskz_loadu_ps(every, in);
        __m512 v1 = _mm512_maskz_loadu_ps(every, in+64);
        __m512 v2 = _mm512_maskz_loadu_ps(every, in+128);
        __m512 v3 = _mm512_maskz_loadu_ps(every, in+192);
        _mm512_mask_storeu_ps(out, every, v0);
        _mm512_mask_storeu_ps(out+64, every, v1);
        _mm512_mask_storeu_ps(out+128, every, v2);
        _mm512_mask_storeu_ps(out+192, every, v3);
    }
}

// Builds a 4-dimensional task (extents 2 x 13 x 1 x 1) whose worker is
// ResNeXt50OneArrangeDats6Callee1 and hands it to ResNeXt50ThreaderDo1 on
// the given team. The worker reads the first two coordinates: the row-group
// index (0..1) and the column index (0..12).
static void ResNeXt50OneArrangeDats6(ResNeXt50ThreaderTeam1* team47, char** tensors67) {
    ResNeXt50ThreaderTask1 job = {
        .callee1 = ResNeXt50OneArrangeDats6Callee1,
        .any1 = tensors67,
        .nd1 = 4,
        .hull1 = {2, 13, 1, 1},
    };
    ResNeXt50ThreaderDo1(team47, &job);
}

// Worker for one point of the "apply" task grid.
//
// task72->any1 is a two-entry pointer pair; only entry 0 (a table of byte
// pointers) is read here:
//   [0] arranged weights  - groups of 6168 bytes (257 slots of 24 bytes)
//                           followed, at group index 85, by 8-byte slots
//   [1] arranged data     - 65536-byte slabs indexed by j, 256 steps each
//   [2] tensor that is added to every accumulator before the clamp
//   [3] destination tensor
// pt41[0] (w54) selects weight groups 2*w54 and 2*w54+1; pt41[1] (d14)
// selects the data column j.
//
// For every selected (j, k) the code seeds accumulators from weight slot
// s == -1, runs 256 fused multiply-add steps of weight-broadcast times data,
// adds the matching values from tensor [2], clamps at zero with max, and
// stores to tensor [3]. Columns j != 12 are four vectors wide; column 12 is
// one vector wide. Groups k != 85 produce six output rows (3136 bytes
// apart); group 85 produces two.
static void ResNeXt50OneApply6Callee1(ResNeXt50ThreaderTask1* task72, int64_t* pt41) {
void** pair16 = task72->any1;
char** tensors70 = pair16[0];
// e21 and g23 are fixed at zero, so every term multiplied by them vanishes.
ptrdiff_t e21 = 0;
ptrdiff_t g23 = 0;
ptrdiff_t d14 = pt41[1];
ptrdiff_t w54 = pt41[0];
char*restrict arrangedWts6 = tensors70[0]+1712128*e21+(ptrdiff_t)526336*1*g23;
char*restrict arrangedDats6 = tensors70[1]+2618560*e21+(ptrdiff_t)802816*1*g23;
char*restrict datPtr21 = tensors70[2]+(ptrdiff_t)1605632*1*g23;
char*restrict datPtr22 = tensors70[3]+(ptrdiff_t)1605632*1*g23;
ptrdiff_t ii30 = 1;
// Single-trip loop (ii30 == 1); i44 is always 0.
for (ptrdiff_t i44 = 0; i44 < ii30; ++i44) {
ptrdiff_t j37 = 1*d14;
ptrdiff_t jj40 = j37+0;
// Wide-column path: taken when the starting column is not 12. The body
// returns at its end (j37 >= jj40 holds on the first pass), so exactly one
// column is handled per call.
for (; j37 != 12; ++j37) {
ptrdiff_t k121 = 2*w54;
ptrdiff_t kk39 = k121+1;
// Six-row weight groups. The return at the bottom fires once k121 reaches
// kk39, limiting this call to groups 2*w54 and 2*w54+1; the loop only falls
// through to the two-row tail below when k121 reaches 85.
for (; k121 != 85; ++k121) {
// With s36 == -1 the offsets below resolve to the first six floats of the
// group; each is broadcast to seed one row of accumulators.
ptrdiff_t s36 = -1;
__m512 sum269 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)24));
__m512 sum273 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)28));
__m512 sum277 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)32));
__m512 sum281 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)36));
__m512 sum285 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)40));
__m512 sum289 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)44));
// Replicate each row's seed across its four column vectors (6 x 4 tile).
__m512 sum270 = sum269;
__m512 sum271 = sum269;
__m512 sum272 = sum269;
__m512 sum274 = sum273;
__m512 sum275 = sum273;
__m512 sum276 = sum273;
__m512 sum278 = sum277;
__m512 sum279 = sum277;
__m512 sum280 = sum277;
__m512 sum282 = sum281;
__m512 sum283 = sum281;
__m512 sum284 = sum281;
__m512 sum286 = sum285;
__m512 sum287 = sum285;
__m512 sum288 = sum285;
__m512 sum290 = sum289;
__m512 sum291 = sum289;
__m512 sum292 = sum289;
// 256 accumulation steps: four data vectors are loaded once per step and
// reused against six broadcast weights.
for (s36 = 0; s36 < 256; ++s36) {
__m512 dat2008 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)0);
__m512 dat2009 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)64);
__m512 dat2010 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)128);
__m512 dat2011 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)192);
__m512 wt375 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)24));
sum269 = _mm512_fmadd_ps(wt375, dat2008, sum269);
sum270 = _mm512_fmadd_ps(wt375, dat2009, sum270);
sum271 = _mm512_fmadd_ps(wt375, dat2010, sum271);
sum272 = _mm512_fmadd_ps(wt375, dat2011, sum272);
__m512 wt376 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)28));
sum273 = _mm512_fmadd_ps(wt376, dat2008, sum273);
sum274 = _mm512_fmadd_ps(wt376, dat2009, sum274);
sum275 = _mm512_fmadd_ps(wt376, dat2010, sum275);
sum276 = _mm512_fmadd_ps(wt376, dat2011, sum276);
__m512 wt377 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)32));
sum277 = _mm512_fmadd_ps(wt377, dat2008, sum277);
sum278 = _mm512_fmadd_ps(wt377, dat2009, sum278);
sum279 = _mm512_fmadd_ps(wt377, dat2010, sum279);
sum280 = _mm512_fmadd_ps(wt377, dat2011, sum280);
__m512 wt378 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)36));
sum281 = _mm512_fmadd_ps(wt378, dat2008, sum281);
sum282 = _mm512_fmadd_ps(wt378, dat2009, sum282);
sum283 = _mm512_fmadd_ps(wt378, dat2010, sum283);
sum284 = _mm512_fmadd_ps(wt378, dat2011, sum284);
__m512 wt379 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)40));
sum285 = _mm512_fmadd_ps(wt379, dat2008, sum285);
sum286 = _mm512_fmadd_ps(wt379, dat2009, sum286);
sum287 = _mm512_fmadd_ps(wt379, dat2010, sum287);
sum288 = _mm512_fmadd_ps(wt379, dat2011, sum288);
__m512 wt380 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)44));
sum289 = _mm512_fmadd_ps(wt380, dat2008, sum289);
sum290 = _mm512_fmadd_ps(wt380, dat2009, sum290);
sum291 = _mm512_fmadd_ps(wt380, dat2010, sum291);
sum292 = _mm512_fmadd_ps(wt380, dat2011, sum292);
}
// Epilogue, one output row at a time (rows are 3136 bytes apart, a group of
// six spans 18816 bytes): add the value from datPtr21, clamp at zero, store
// to datPtr22.
sum269 = _mm512_add_ps(sum269, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0));
sum270 = _mm512_add_ps(sum270, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64));
sum271 = _mm512_add_ps(sum271, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128));
sum272 = _mm512_add_ps(sum272, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192));
sum269 = _mm512_max_ps(_mm512_setzero_ps(), sum269);
sum270 = _mm512_max_ps(_mm512_setzero_ps(), sum270);
sum271 = _mm512_max_ps(_mm512_setzero_ps(), sum271);
sum272 = _mm512_max_ps(_mm512_setzero_ps(), sum272);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0, 65535, sum269);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64, 65535, sum270);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128, 65535, sum271);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192, 65535, sum272);
sum273 = _mm512_add_ps(sum273, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136));
sum274 = _mm512_add_ps(sum274, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200));
sum275 = _mm512_add_ps(sum275, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264));
sum276 = _mm512_add_ps(sum276, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328));
sum273 = _mm512_max_ps(_mm512_setzero_ps(), sum273);
sum274 = _mm512_max_ps(_mm512_setzero_ps(), sum274);
sum275 = _mm512_max_ps(_mm512_setzero_ps(), sum275);
sum276 = _mm512_max_ps(_mm512_setzero_ps(), sum276);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136, 65535, sum273);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200, 65535, sum274);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264, 65535, sum275);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328, 65535, sum276);
sum277 = _mm512_add_ps(sum277, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6272));
sum278 = _mm512_add_ps(sum278, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6336));
sum279 = _mm512_add_ps(sum279, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6400));
sum280 = _mm512_add_ps(sum280, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6464));
sum277 = _mm512_max_ps(_mm512_setzero_ps(), sum277);
sum278 = _mm512_max_ps(_mm512_setzero_ps(), sum278);
sum279 = _mm512_max_ps(_mm512_setzero_ps(), sum279);
sum280 = _mm512_max_ps(_mm512_setzero_ps(), sum280);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6272, 65535, sum277);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6336, 65535, sum278);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6400, 65535, sum279);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6464, 65535, sum280);
sum281 = _mm512_add_ps(sum281, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9408));
sum282 = _mm512_add_ps(sum282, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9472));
sum283 = _mm512_add_ps(sum283, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9536));
sum284 = _mm512_add_ps(sum284, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9600));
sum281 = _mm512_max_ps(_mm512_setzero_ps(), sum281);
sum282 = _mm512_max_ps(_mm512_setzero_ps(), sum282);
sum283 = _mm512_max_ps(_mm512_setzero_ps(), sum283);
sum284 = _mm512_max_ps(_mm512_setzero_ps(), sum284);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9408, 65535, sum281);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9472, 65535, sum282);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9536, 65535, sum283);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9600, 65535, sum284);
sum285 = _mm512_add_ps(sum285, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12544));
sum286 = _mm512_add_ps(sum286, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12608));
sum287 = _mm512_add_ps(sum287, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12672));
sum288 = _mm512_add_ps(sum288, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12736));
sum285 = _mm512_max_ps(_mm512_setzero_ps(), sum285);
sum286 = _mm512_max_ps(_mm512_setzero_ps(), sum286);
sum287 = _mm512_max_ps(_mm512_setzero_ps(), sum287);
sum288 = _mm512_max_ps(_mm512_setzero_ps(), sum288);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12544, 65535, sum285);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12608, 65535, sum286);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12672, 65535, sum287);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12736, 65535, sum288);
sum289 = _mm512_add_ps(sum289, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15680));
sum290 = _mm512_add_ps(sum290, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15744));
sum291 = _mm512_add_ps(sum291, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15808));
sum292 = _mm512_add_ps(sum292, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15872));
sum289 = _mm512_max_ps(_mm512_setzero_ps(), sum289);
sum290 = _mm512_max_ps(_mm512_setzero_ps(), sum290);
sum291 = _mm512_max_ps(_mm512_setzero_ps(), sum291);
sum292 = _mm512_max_ps(_mm512_setzero_ps(), sum292);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15680, 65535, sum289);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15744, 65535, sum290);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15808, 65535, sum291);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15872, 65535, sum292);
// Second group of this task done: nothing more to do for this call.
if (k121 >= kk39) return;
}
// Reached only with k121 == 85: the trailing two-row group. Weight slots
// here are 8 bytes (two floats) instead of 24.
ptrdiff_t s37 = -1;
__m512 sum293 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)8));
__m512 sum297 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)12));
__m512 sum294 = sum293;
__m512 sum295 = sum293;
__m512 sum296 = sum293;
__m512 sum298 = sum297;
__m512 sum299 = sum297;
__m512 sum300 = sum297;
for (s37 = 0; s37 < 256; ++s37) {
__m512 dat2012 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)0);
__m512 dat2013 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)64);
__m512 dat2014 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)128);
__m512 dat2015 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)192);
__m512 wt381 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)8));
sum293 = _mm512_fmadd_ps(wt381, dat2012, sum293);
sum294 = _mm512_fmadd_ps(wt381, dat2013, sum294);
sum295 = _mm512_fmadd_ps(wt381, dat2014, sum295);
sum296 = _mm512_fmadd_ps(wt381, dat2015, sum296);
__m512 wt382 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)12));
sum297 = _mm512_fmadd_ps(wt382, dat2012, sum297);
sum298 = _mm512_fmadd_ps(wt382, dat2013, sum298);
sum299 = _mm512_fmadd_ps(wt382, dat2014, sum299);
sum300 = _mm512_fmadd_ps(wt382, dat2015, sum300);
}
// Same add / clamp / store epilogue, for the two rows of the tail group.
sum293 = _mm512_add_ps(sum293, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0));
sum294 = _mm512_add_ps(sum294, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64));
sum295 = _mm512_add_ps(sum295, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128));
sum296 = _mm512_add_ps(sum296, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192));
sum293 = _mm512_max_ps(_mm512_setzero_ps(), sum293);
sum294 = _mm512_max_ps(_mm512_setzero_ps(), sum294);
sum295 = _mm512_max_ps(_mm512_setzero_ps(), sum295);
sum296 = _mm512_max_ps(_mm512_setzero_ps(), sum296);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0, 65535, sum293);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64, 65535, sum294);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128, 65535, sum295);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192, 65535, sum296);
sum297 = _mm512_add_ps(sum297, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136));
sum298 = _mm512_add_ps(sum298, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200));
sum299 = _mm512_add_ps(sum299, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264));
sum300 = _mm512_add_ps(sum300, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328));
sum297 = _mm512_max_ps(_mm512_setzero_ps(), sum297);
sum298 = _mm512_max_ps(_mm512_setzero_ps(), sum298);
sum299 = _mm512_max_ps(_mm512_setzero_ps(), sum299);
sum300 = _mm512_max_ps(_mm512_setzero_ps(), sum300);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136, 65535, sum297);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200, 65535, sum298);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264, 65535, sum299);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328, 65535, sum300);
// jj40 equals the starting j37, so this always returns after one column.
if (j37 >= jj40) return;
}
// Narrow-column path: reached only when the starting column is 12. Each
// step uses a single data vector, and arranged data steps are 64 bytes apart
// instead of 256.
ptrdiff_t k122 = 2*w54;
ptrdiff_t kk40 = k122+1;
for (; k122 != 85; ++k122) {
// Six accumulators (one per output row), seeded from weight slot s38 == -1.
ptrdiff_t s38 = -1;
__m512 sum301 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)24));
__m512 sum302 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)28));
__m512 sum303 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)32));
__m512 sum304 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)36));
__m512 sum305 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)40));
__m512 sum306 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)44));
for (s38 = 0; s38 < 256; ++s38) {
__m512 dat2016 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+64*s38+(ptrdiff_t)0);
__m512 wt383 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)24));
sum301 = _mm512_fmadd_ps(wt383, dat2016, sum301);
__m512 wt384 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)28));
sum302 = _mm512_fmadd_ps(wt384, dat2016, sum302);
__m512 wt385 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)32));
sum303 = _mm512_fmadd_ps(wt385, dat2016, sum303);
__m512 wt386 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)36));
sum304 = _mm512_fmadd_ps(wt386, dat2016, sum304);
__m512 wt387 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)40));
sum305 = _mm512_fmadd_ps(wt387, dat2016, sum305);
__m512 wt388 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)44));
sum306 = _mm512_fmadd_ps(wt388, dat2016, sum306);
}
// Add from datPtr21, clamp at zero, store to datPtr22 - one vector per row.
sum301 = _mm512_add_ps(sum301, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0));
sum301 = _mm512_max_ps(_mm512_setzero_ps(), sum301);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0, 65535, sum301);
sum302 = _mm512_add_ps(sum302, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136));
sum302 = _mm512_max_ps(_mm512_setzero_ps(), sum302);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136, 65535, sum302);
sum303 = _mm512_add_ps(sum303, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)6272));
sum303 = _mm512_max_ps(_mm512_setzero_ps(), sum303);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)6272, 65535, sum303);
sum304 = _mm512_add_ps(sum304, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)9408));
sum304 = _mm512_max_ps(_mm512_setzero_ps(), sum304);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)9408, 65535, sum304);
sum305 = _mm512_add_ps(sum305, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)12544));
sum305 = _mm512_max_ps(_mm512_setzero_ps(), sum305);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)12544, 65535, sum305);
sum306 = _mm512_add_ps(sum306, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)15680));
sum306 = _mm512_max_ps(_mm512_setzero_ps(), sum306);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)15680, 65535, sum306);
// Second group of this task done: nothing more to do for this call.
if (k122 >= kk40) return;
}
// Reached only with k122 == 85: trailing two-row group for the narrow column.
ptrdiff_t s39 = -1;
__m512 sum307 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)8));
__m512 sum308 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)12));
for (s39 = 0; s39 < 256; ++s39) {
__m512 dat2017 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+64*s39+(ptrdiff_t)0);
__m512 wt389 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)8));
sum307 = _mm512_fmadd_ps(wt389, dat2017, sum307);
__m512 wt390 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)12));
sum308 = _mm512_fmadd_ps(wt390, dat2017, sum308);
}
sum307 = _mm512_add_ps(sum307, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0));
sum307 = _mm512_max_ps(_mm512_setzero_ps(), sum307);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0, 65535, sum307);
sum308 = _mm512_add_ps(sum308, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136));
sum308 = _mm512_max_ps(_mm512_setzero_ps(), sum308);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136, 65535, sum308);
}
}

// Builds a 3-dimensional task (extents 43 x 13 x 1) whose worker is
// ResNeXt50OneApply6Callee1 and hands it to ResNeXt50ThreaderDo1 on the
// given team. The payload is a two-slot pointer pair: slot 0 carries the
// tensor pointer table, slot 1 is null.
static void ResNeXt50OneApply6(ResNeXt50ThreaderTeam1* team48, char** tensors69) {
    void* payload[] = {tensors69, 0};
    ResNeXt50ThreaderTask1 job = {
        .callee1 = ResNeXt50OneApply6Callee1,
        .any1 = payload,
        .nd1 = 3,
        .hull1 = {43, 13, 1},
    };
    ResNeXt50ThreaderDo1(team48, &job);
}

static void ResNeXt50OneArrangeWts7Callee1(ResNeXt50ThreaderTask1* task74, int64_t* pt42) {
char** tensors72 = task74->any1;
ptrdiff_t b70 = pt42[0];
char*restrict wtPtr12 = tensors72[0]+(ptrdiff_t)3340*0+(ptrdiff_t)524288*0;
char*restrict biasPtr12 = tensors72[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr13 = tensors72[2]+(ptrdiff_t)8*256*0;
char*restrict arranged13 = tensors72[3]+(ptrdiff_t)856064*0+(ptrdiff_t)525312*0;
ptrdiff_t ii31 = 1;
for (ptrdiff_t i45 = 0; i45 < ii31; ++i45) {
ptrdiff_t j38 = 1*b70;
ptrdiff_t jj41 = j38+1;
for (; j38 < jj41; ++j38) {
if (j38 < 15) {
ptrdiff_t k124 = 0+16*(j38-0);
ptrdiff_t l52 = (size_t)(0+k124)/6;
ptrdiff_t cut16 = (size_t)(0+k124)%6;
switch (cut16) {
case 0:;
case 2: {
__m512 sum310 = _mm512_maskz_loadu_ps(65535, biasPtr12+1024*i45+4*k124);
__m512i pmMul24 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd24 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo20 = _mm512_loadu_ps(bnPtr13+(ptrdiff_t)8*(k124+256*i45));
__m512 masHi20 = _mm512_maskz_loadu_ps(65535, bnPtr13+(ptrdiff_t)8*(k124+256*i45)+(ptrdiff_t)64);
__m512 postMul37 = _mm512_permutex2var_ps(masLo20, pmMul24, masHi20);
__m512 postAdd25 = _mm512_permutex2var_ps(masLo20, pmAdd24, masHi20);
sum310 = _mm512_fmadd_ps(sum310, postMul37, postAdd25);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum310);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)12288, 4032>>cut16, sum310);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)24576, 65535-(4095>>cut16), sum310);
ptrdiff_t c36 = 0;
for (; c36 != 32; ++c36) {
__m512 wt407 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)0);
__m512 wt408 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)2048);
__m512 wt409 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)4096);
__m512 wt410 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)6144);
__m512 wt411 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)8192);
__m512 wt412 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)10240);
__m512 wt413 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)12288);
__m512 wt414 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)14336);
__m512 wt415 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)16384);
__m512 wt416 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)18432);
__m512 wt417 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)20480);
__m512 wt418 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)22528);
__m512 wt419 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)24576);
__m512 wt420 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)26624);
__m512 wt421 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)28672);
__m512 wt422 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)30720);
__m512 tmp10937 = _mm512_unpacklo_ps(wt407, wt408);
__m512 tmp10938 = _mm512_unpackhi_ps(wt407, wt408);
__m512 tmp10939 = _mm512_unpacklo_ps(wt409, wt410);
__m512 tmp10940 = _mm512_unpackhi_ps(wt409, wt410);
__m512 tmp10941 = _mm512_unpacklo_ps(wt411, wt412);
__m512 tmp10942 = _mm512_unpackhi_ps(wt411, wt412);
__m512 tmp10943 = _mm512_unpacklo_ps(wt413, wt414);
__m512 tmp10944 = _mm512_unpackhi_ps(wt413, wt414);
__m512 tmp10945 = _mm512_unpacklo_ps(wt415, wt416);
__m512 tmp10946 = _mm512_unpackhi_ps(wt415, wt416);
__m512 tmp10947 = _mm512_unpacklo_ps(wt417, wt418);
__m512 tmp10948 = _mm512_unpackhi_ps(wt417, wt418);
__m512 tmp10949 = _mm512_unpacklo_ps(wt419, wt420);
__m512 tmp10950 = _mm512_unpackhi_ps(wt419, wt420);
__m512 tmp10951 = _mm512_unpacklo_ps(wt421, wt422);
__m512 tmp10952 = _mm512_unpackhi_ps(wt421, wt422);
__m512 tmp10953 = _mm512_shuffle_ps(tmp10937, tmp10939, 68);
__m512 tmp10954 = _mm512_shuffle_ps(tmp10937, tmp10939, 238);
__m512 tmp10955 = _mm512_shuffle_ps(tmp10938, tmp10940, 68);
__m512 tmp10956 = _mm512_shuffle_ps(tmp10938, tmp10940, 238);
__m512 tmp10957 = _mm512_shuffle_ps(tmp10941, tmp10943, 68);
__m512 tmp10958 = _mm512_shuffle_ps(tmp10941, tmp10943, 238);
__m512 tmp10959 = _mm512_shuffle_ps(tmp10942, tmp10944, 68);
__m512 tmp10960 = _mm512_shuffle_ps(tmp10942, tmp10944, 238);
__m512 tmp10961 = _mm512_shuffle_ps(tmp10945, tmp10947, 68);
__m512 tmp10962 = _mm512_shuffle_ps(tmp10945, tmp10947, 238);
__m512 tmp10963 = _mm512_shuffle_ps(tmp10946, tmp10948, 68);
__m512 tmp10964 = _mm512_shuffle_ps(tmp10946, tmp10948, 238);
__m512 tmp10965 = _mm512_shuffle_ps(tmp10949, tmp10951, 68);
__m512 tmp10966 = _mm512_shuffle_ps(tmp10949, tmp10951, 238);
__m512 tmp10967 = _mm512_shuffle_ps(tmp10950, tmp10952, 68);
__m512 tmp10968 = _mm512_shuffle_ps(tmp10950, tmp10952, 238);
__m512 tmp10969 = _mm512_shuffle_f32x4(tmp10953, tmp10957, 136);
__m512 tmp10970 = _mm512_shuffle_f32x4(tmp10953, tmp10957, 221);
__m512 tmp10971 = _mm512_shuffle_f32x4(tmp10954, tmp10958, 136);
__m512 tmp10972 = _mm512_shuffle_f32x4(tmp10954, tmp10958, 221);
__m512 tmp10973 = _mm512_shuffle_f32x4(tmp10955, tmp10959, 136);
__m512 tmp10974 = _mm512_shuffle_f32x4(tmp10955, tmp10959, 221);
__m512 tmp10975 = _mm512_shuffle_f32x4(tmp10956, tmp10960, 136);
__m512 tmp10976 = _mm512_shuffle_f32x4(tmp10956, tmp10960, 221);
__m512 tmp10977 = _mm512_shuffle_f32x4(tmp10961, tmp10965, 136);
__m512 tmp10978 = _mm512_shuffle_f32x4(tmp10961, tmp10965, 221);
__m512 tmp10979 = _mm512_shuffle_f32x4(tmp10962, tmp10966, 136);
__m512 tmp10980 = _mm512_shuffle_f32x4(tmp10962, tmp10966, 221);
__m512 tmp10981 = _mm512_shuffle_f32x4(tmp10963, tmp10967, 136);
__m512 tmp10982 = _mm512_shuffle_f32x4(tmp10963, tmp10967, 221);
__m512 tmp10983 = _mm512_shuffle_f32x4(tmp10964, tmp10968, 136);
__m512 tmp10984 = _mm512_shuffle_f32x4(tmp10964, tmp10968, 221);
wt407 = _mm512_shuffle_f32x4(tmp10969, tmp10977, 136);
wt415 = _mm512_shuffle_f32x4(tmp10969, tmp10977, 221);
wt408 = _mm512_shuffle_f32x4(tmp10971, tmp10979, 136);
wt416 = _mm512_shuffle_f32x4(tmp10971, tmp10979, 221);
wt409 = _mm512_shuffle_f32x4(tmp10973, tmp10981, 136);
wt417 = _mm512_shuffle_f32x4(tmp10973, tmp10981, 221);
wt410 = _mm512_shuffle_f32x4(tmp10975, tmp10983, 136);
wt418 = _mm512_shuffle_f32x4(tmp10975, tmp10983, 221);
wt411 = _mm512_shuffle_f32x4(tmp10970, tmp10978, 136);
wt419 = _mm512_shuffle_f32x4(tmp10970, tmp10978, 221);
wt412 = _mm512_shuffle_f32x4(tmp10972, tmp10980, 136);
wt420 = _mm512_shuffle_f32x4(tmp10972, tmp10980, 221);
wt413 = _mm512_shuffle_f32x4(tmp10974, tmp10982, 136);
wt421 = _mm512_shuffle_f32x4(tmp10974, tmp10982, 221);
wt414 = _mm512_shuffle_f32x4(tmp10976, tmp10984, 136);
wt422 = _mm512_shuffle_f32x4(tmp10976, tmp10984, 221);
wt407 = _mm512_mul_ps(wt407, postMul37);
wt408 = _mm512_mul_ps(wt408, postMul37);
wt409 = _mm512_mul_ps(wt409, postMul37);
wt410 = _mm512_mul_ps(wt410, postMul37);
wt411 = _mm512_mul_ps(wt411, postMul37);
wt412 = _mm512_mul_ps(wt412, postMul37);
wt413 = _mm512_mul_ps(wt413, postMul37);
wt414 = _mm512_mul_ps(wt414, postMul37);
wt415 = _mm512_mul_ps(wt415, postMul37);
wt416 = _mm512_mul_ps(wt416, postMul37);
wt417 = _mm512_mul_ps(wt417, postMul37);
wt418 = _mm512_mul_ps(wt418, postMul37);
wt419 = _mm512_mul_ps(wt419, postMul37);
wt420 = _mm512_mul_ps(wt420, postMul37);
wt421 = _mm512_mul_ps(wt421, postMul37);
wt422 = _mm512_mul_ps(wt422, postMul37);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c36)+(ptrdiff_t)0, 63>>cut16, wt407);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c36)+(ptrdiff_t)0, 63>>cut16, wt408);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c36)+(ptrdiff_t)0, 63>>cut16, wt409);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c36)+(ptrdiff_t)0, 63>>cut16, wt410);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c36)+(ptrdiff_t)0, 63>>cut16, wt411);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c36)+(ptrdiff_t)0, 63>>cut16, wt412);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c36)+(ptrdiff_t)0, 63>>cut16, wt413);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c36)+(ptrdiff_t)0, 63>>cut16, wt414);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c36)+(ptrdiff_t)0, 63>>cut16, wt415);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c36)+(ptrdiff_t)0, 63>>cut16, wt416);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c36)+(ptrdiff_t)0, 63>>cut16, wt417);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c36)+(ptrdiff_t)0, 63>>cut16, wt418);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c36)+(ptrdiff_t)0, 63>>cut16, wt419);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c36)+(ptrdiff_t)0, 63>>cut16, wt420);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c36)+(ptrdiff_t)0, 63>>cut16, wt421);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c36)+(ptrdiff_t)0, 63>>cut16, wt422);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt407);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt408);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt409);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt410);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt411);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt412);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt413);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt414);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt415);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt416);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt417);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt418);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt419);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt420);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt421);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt422);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt407);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt408);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt409);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt410);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt411);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt412);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt413);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt414);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt415);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt416);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt417);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt418);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt419);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt420);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt421);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt422);
}
break;
}
default: {
cut16 = 4;
__m512 sum311 = _mm512_maskz_loadu_ps(65535, biasPtr12+1024*i45+4*k124);
__m512i pmMul25 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd25 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo21 = _mm512_loadu_ps(bnPtr13+(ptrdiff_t)8*(k124+256*i45));
__m512 masHi21 = _mm512_maskz_loadu_ps(65535, bnPtr13+(ptrdiff_t)8*(k124+256*i45)+(ptrdiff_t)64);
__m512 postMul38 = _mm512_permutex2var_ps(masLo21, pmMul25, masHi21);
__m512 postAdd26 = _mm512_permutex2var_ps(masLo21, pmAdd25, masHi21);
sum311 = _mm512_fmadd_ps(sum311, postMul38, postAdd26);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum311);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)12288, 4032>>cut16, sum311);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)24576, 258048>>cut16, sum311);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)36864, 65535-(262143>>cut16), sum311);
ptrdiff_t c37 = 0;
for (; c37 != 32; ++c37) {
__m512 wt423 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)0);
__m512 wt424 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)2048);
__m512 wt425 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)4096);
__m512 wt426 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)6144);
__m512 wt427 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)8192);
__m512 wt428 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)10240);
__m512 wt429 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)12288);
__m512 wt430 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)14336);
__m512 wt431 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)16384);
__m512 wt432 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)18432);
__m512 wt433 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)20480);
__m512 wt434 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)22528);
__m512 wt435 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)24576);
__m512 wt436 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)26624);
__m512 wt437 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)28672);
__m512 wt438 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)30720);
__m512 tmp10985 = _mm512_unpacklo_ps(wt423, wt424);
__m512 tmp10986 = _mm512_unpackhi_ps(wt423, wt424);
__m512 tmp10987 = _mm512_unpacklo_ps(wt425, wt426);
__m512 tmp10988 = _mm512_unpackhi_ps(wt425, wt426);
__m512 tmp10989 = _mm512_unpacklo_ps(wt427, wt428);
__m512 tmp10990 = _mm512_unpackhi_ps(wt427, wt428);
__m512 tmp10991 = _mm512_unpacklo_ps(wt429, wt430);
__m512 tmp10992 = _mm512_unpackhi_ps(wt429, wt430);
__m512 tmp10993 = _mm512_unpacklo_ps(wt431, wt432);
__m512 tmp10994 = _mm512_unpackhi_ps(wt431, wt432);
__m512 tmp10995 = _mm512_unpacklo_ps(wt433, wt434);
__m512 tmp10996 = _mm512_unpackhi_ps(wt433, wt434);
__m512 tmp10997 = _mm512_unpacklo_ps(wt435, wt436);
__m512 tmp10998 = _mm512_unpackhi_ps(wt435, wt436);
__m512 tmp10999 = _mm512_unpacklo_ps(wt437, wt438);
__m512 tmp11000 = _mm512_unpackhi_ps(wt437, wt438);
__m512 tmp11001 = _mm512_shuffle_ps(tmp10985, tmp10987, 68);
__m512 tmp11002 = _mm512_shuffle_ps(tmp10985, tmp10987, 238);
__m512 tmp11003 = _mm512_shuffle_ps(tmp10986, tmp10988, 68);
__m512 tmp11004 = _mm512_shuffle_ps(tmp10986, tmp10988, 238);
__m512 tmp11005 = _mm512_shuffle_ps(tmp10989, tmp10991, 68);
__m512 tmp11006 = _mm512_shuffle_ps(tmp10989, tmp10991, 238);
__m512 tmp11007 = _mm512_shuffle_ps(tmp10990, tmp10992, 68);
__m512 tmp11008 = _mm512_shuffle_ps(tmp10990, tmp10992, 238);
__m512 tmp11009 = _mm512_shuffle_ps(tmp10993, tmp10995, 68);
__m512 tmp11010 = _mm512_shuffle_ps(tmp10993, tmp10995, 238);
__m512 tmp11011 = _mm512_shuffle_ps(tmp10994, tmp10996, 68);
__m512 tmp11012 = _mm512_shuffle_ps(tmp10994, tmp10996, 238);
__m512 tmp11013 = _mm512_shuffle_ps(tmp10997, tmp10999, 68);
__m512 tmp11014 = _mm512_shuffle_ps(tmp10997, tmp10999, 238);
__m512 tmp11015 = _mm512_shuffle_ps(tmp10998, tmp11000, 68);
__m512 tmp11016 = _mm512_shuffle_ps(tmp10998, tmp11000, 238);
__m512 tmp11017 = _mm512_shuffle_f32x4(tmp11001, tmp11005, 136);
__m512 tmp11018 = _mm512_shuffle_f32x4(tmp11001, tmp11005, 221);
__m512 tmp11019 = _mm512_shuffle_f32x4(tmp11002, tmp11006, 136);
__m512 tmp11020 = _mm512_shuffle_f32x4(tmp11002, tmp11006, 221);
__m512 tmp11021 = _mm512_shuffle_f32x4(tmp11003, tmp11007, 136);
__m512 tmp11022 = _mm512_shuffle_f32x4(tmp11003, tmp11007, 221);
__m512 tmp11023 = _mm512_shuffle_f32x4(tmp11004, tmp11008, 136);
__m512 tmp11024 = _mm512_shuffle_f32x4(tmp11004, tmp11008, 221);
__m512 tmp11025 = _mm512_shuffle_f32x4(tmp11009, tmp11013, 136);
__m512 tmp11026 = _mm512_shuffle_f32x4(tmp11009, tmp11013, 221);
__m512 tmp11027 = _mm512_shuffle_f32x4(tmp11010, tmp11014, 136);
__m512 tmp11028 = _mm512_shuffle_f32x4(tmp11010, tmp11014, 221);
__m512 tmp11029 = _mm512_shuffle_f32x4(tmp11011, tmp11015, 136);
__m512 tmp11030 = _mm512_shuffle_f32x4(tmp11011, tmp11015, 221);
__m512 tmp11031 = _mm512_shuffle_f32x4(tmp11012, tmp11016, 136);
__m512 tmp11032 = _mm512_shuffle_f32x4(tmp11012, tmp11016, 221);
wt423 = _mm512_shuffle_f32x4(tmp11017, tmp11025, 136);
wt431 = _mm512_shuffle_f32x4(tmp11017, tmp11025, 221);
wt424 = _mm512_shuffle_f32x4(tmp11019, tmp11027, 136);
wt432 = _mm512_shuffle_f32x4(tmp11019, tmp11027, 221);
wt425 = _mm512_shuffle_f32x4(tmp11021, tmp11029, 136);
wt433 = _mm512_shuffle_f32x4(tmp11021, tmp11029, 221);
wt426 = _mm512_shuffle_f32x4(tmp11023, tmp11031, 136);
wt434 = _mm512_shuffle_f32x4(tmp11023, tmp11031, 221);
wt427 = _mm512_shuffle_f32x4(tmp11018, tmp11026, 136);
wt435 = _mm512_shuffle_f32x4(tmp11018, tmp11026, 221);
wt428 = _mm512_shuffle_f32x4(tmp11020, tmp11028, 136);
wt436 = _mm512_shuffle_f32x4(tmp11020, tmp11028, 221);
wt429 = _mm512_shuffle_f32x4(tmp11022, tmp11030, 136);
wt437 = _mm512_shuffle_f32x4(tmp11022, tmp11030, 221);
wt430 = _mm512_shuffle_f32x4(tmp11024, tmp11032, 136);
wt438 = _mm512_shuffle_f32x4(tmp11024, tmp11032, 221);
wt423 = _mm512_mul_ps(wt423, postMul38);
wt424 = _mm512_mul_ps(wt424, postMul38);
wt425 = _mm512_mul_ps(wt425, postMul38);
wt426 = _mm512_mul_ps(wt426, postMul38);
wt427 = _mm512_mul_ps(wt427, postMul38);
wt428 = _mm512_mul_ps(wt428, postMul38);
wt429 = _mm512_mul_ps(wt429, postMul38);
wt430 = _mm512_mul_ps(wt430, postMul38);
wt431 = _mm512_mul_ps(wt431, postMul38);
wt432 = _mm512_mul_ps(wt432, postMul38);
wt433 = _mm512_mul_ps(wt433, postMul38);
wt434 = _mm512_mul_ps(wt434, postMul38);
wt435 = _mm512_mul_ps(wt435, postMul38);
wt436 = _mm512_mul_ps(wt436, postMul38);
wt437 = _mm512_mul_ps(wt437, postMul38);
wt438 = _mm512_mul_ps(wt438, postMul38);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)0, 63>>cut16, wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)0, 63>>cut16, wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)0, 63>>cut16, wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)0, 63>>cut16, wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)0, 63>>cut16, wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)0, 63>>cut16, wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)0, 63>>cut16, wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)0, 63>>cut16, wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)0, 63>>cut16, wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)0, 63>>cut16, wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)0, 63>>cut16, wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)0, 63>>cut16, wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)0, 63>>cut16, wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)0, 63>>cut16, wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)0, 63>>cut16, wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)0, 63>>cut16, wt438);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt438);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt438);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt438);
}
}
}
} else {
ptrdiff_t k123 = 240;
ptrdiff_t l51 = (size_t)(0+k123)/6;
ptrdiff_t cut15 = (size_t)(0+k123)%6;
__m512 sum309 = _mm512_maskz_loadu_ps(65535, biasPtr12+1024*i45+4*k123);
__m512i pmMul26 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd26 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo22 = _mm512_loadu_ps(bnPtr13+(ptrdiff_t)8*(k123+256*i45));
__m512 masHi22 = _mm512_maskz_loadu_ps(65535, bnPtr13+(ptrdiff_t)8*(k123+256*i45)+(ptrdiff_t)64);
__m512 postMul36 = _mm512_permutex2var_ps(masLo22, pmMul26, masHi22);
__m512 postAdd24 = _mm512_permutex2var_ps(masLo22, pmAdd26, masHi22);
sum309 = _mm512_fmadd_ps(sum309, postMul36, postAdd24);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum309);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*0+(ptrdiff_t)12288, 4032>>cut15, sum309);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*0+(ptrdiff_t)24576, 65535-(4095>>cut15), sum309);
ptrdiff_t c35 = 0;
for (; c35 != 32; ++c35) {
__m512 wt391 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)0);
__m512 wt392 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)2048);
__m512 wt393 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)4096);
__m512 wt394 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)6144);
__m512 wt395 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)8192);
__m512 wt396 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)10240);
__m512 wt397 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)12288);
__m512 wt398 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)14336);
__m512 wt399 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)16384);
__m512 wt400 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)18432);
__m512 wt401 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)20480);
__m512 wt402 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)22528);
__m512 wt403 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)24576);
__m512 wt404 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)26624);
__m512 wt405 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)28672);
__m512 wt406 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)30720);
__m512 tmp11033 = _mm512_unpacklo_ps(wt391, wt392);
__m512 tmp11034 = _mm512_unpackhi_ps(wt391, wt392);
__m512 tmp11035 = _mm512_unpacklo_ps(wt393, wt394);
__m512 tmp11036 = _mm512_unpackhi_ps(wt393, wt394);
__m512 tmp11037 = _mm512_unpacklo_ps(wt395, wt396);
__m512 tmp11038 = _mm512_unpackhi_ps(wt395, wt396);
__m512 tmp11039 = _mm512_unpacklo_ps(wt397, wt398);
__m512 tmp11040 = _mm512_unpackhi_ps(wt397, wt398);
__m512 tmp11041 = _mm512_unpacklo_ps(wt399, wt400);
__m512 tmp11042 = _mm512_unpackhi_ps(wt399, wt400);
__m512 tmp11043 = _mm512_unpacklo_ps(wt401, wt402);
__m512 tmp11044 = _mm512_unpackhi_ps(wt401, wt402);
__m512 tmp11045 = _mm512_unpacklo_ps(wt403, wt404);
__m512 tmp11046 = _mm512_unpackhi_ps(wt403, wt404);
__m512 tmp11047 = _mm512_unpacklo_ps(wt405, wt406);
__m512 tmp11048 = _mm512_unpackhi_ps(wt405, wt406);
__m512 tmp11049 = _mm512_shuffle_ps(tmp11033, tmp11035, 68);
__m512 tmp11050 = _mm512_shuffle_ps(tmp11033, tmp11035, 238);
__m512 tmp11051 = _mm512_shuffle_ps(tmp11034, tmp11036, 68);
__m512 tmp11052 = _mm512_shuffle_ps(tmp11034, tmp11036, 238);
__m512 tmp11053 = _mm512_shuffle_ps(tmp11037, tmp11039, 68);
__m512 tmp11054 = _mm512_shuffle_ps(tmp11037, tmp11039, 238);
__m512 tmp11055 = _mm512_shuffle_ps(tmp11038, tmp11040, 68);
__m512 tmp11056 = _mm512_shuffle_ps(tmp11038, tmp11040, 238);
__m512 tmp11057 = _mm512_shuffle_ps(tmp11041, tmp11043, 68);
__m512 tmp11058 = _mm512_shuffle_ps(tmp11041, tmp11043, 238);
__m512 tmp11059 = _mm512_shuffle_ps(tmp11042, tmp11044, 68);
__m512 tmp11060 = _mm512_shuffle_ps(tmp11042, tmp11044, 238);
__m512 tmp11061 = _mm512_shuffle_ps(tmp11045, tmp11047, 68);
__m512 tmp11062 = _mm512_shuffle_ps(tmp11045, tmp11047, 238);
__m512 tmp11063 = _mm512_shuffle_ps(tmp11046, tmp11048, 68);
__m512 tmp11064 = _mm512_shuffle_ps(tmp11046, tmp11048, 238);
__m512 tmp11065 = _mm512_shuffle_f32x4(tmp11049, tmp11053, 136);
__m512 tmp11066 = _mm512_shuffle_f32x4(tmp11049, tmp11053, 221);
__m512 tmp11067 = _mm512_shuffle_f32x4(tmp11050, tmp11054, 136);
__m512 tmp11068 = _mm512_shuffle_f32x4(tmp11050, tmp11054, 221);
__m512 tmp11069 = _mm512_shuffle_f32x4(tmp11051, tmp11055, 136);
__m512 tmp11070 = _mm512_shuffle_f32x4(tmp11051, tmp11055, 221);
__m512 tmp11071 = _mm512_shuffle_f32x4(tmp11052, tmp11056, 136);
__m512 tmp11072 = _mm512_shuffle_f32x4(tmp11052, tmp11056, 221);
__m512 tmp11073 = _mm512_shuffle_f32x4(tmp11057, tmp11061, 136);
__m512 tmp11074 = _mm512_shuffle_f32x4(tmp11057, tmp11061, 221);
__m512 tmp11075 = _mm512_shuffle_f32x4(tmp11058, tmp11062, 136);
__m512 tmp11076 = _mm512_shuffle_f32x4(tmp11058, tmp11062, 221);
__m512 tmp11077 = _mm512_shuffle_f32x4(tmp11059, tmp11063, 136);
__m512 tmp11078 = _mm512_shuffle_f32x4(tmp11059, tmp11063, 221);
__m512 tmp11079 = _mm512_shuffle_f32x4(tmp11060, tmp11064, 136);
__m512 tmp11080 = _mm512_shuffle_f32x4(tmp11060, tmp11064, 221);
wt391 = _mm512_shuffle_f32x4(tmp11065, tmp11073, 136);
wt399 = _mm512_shuffle_f32x4(tmp11065, tmp11073, 221);
wt392 = _mm512_shuffle_f32x4(tmp11067, tmp11075, 136);
wt400 = _mm512_shuffle_f32x4(tmp11067, tmp11075, 221);
wt393 = _mm512_shuffle_f32x4(tmp11069, tmp11077, 136);
wt401 = _mm512_shuffle_f32x4(tmp11069, tmp11077, 221);
wt394 = _mm512_shuffle_f32x4(tmp11071, tmp11079, 136);
wt402 = _mm512_shuffle_f32x4(tmp11071, tmp11079, 221);
wt395 = _mm512_shuffle_f32x4(tmp11066, tmp11074, 136);
wt403 = _mm512_shuffle_f32x4(tmp11066, tmp11074, 221);
wt396 = _mm512_shuffle_f32x4(tmp11068, tmp11076, 136);
wt404 = _mm512_shuffle_f32x4(tmp11068, tmp11076, 221);
wt397 = _mm512_shuffle_f32x4(tmp11070, tmp11078, 136);
wt405 = _mm512_shuffle_f32x4(tmp11070, tmp11078, 221);
wt398 = _mm512_shuffle_f32x4(tmp11072, tmp11080, 136);
wt406 = _mm512_shuffle_f32x4(tmp11072, tmp11080, 221);
wt391 = _mm512_mul_ps(wt391, postMul36);
wt392 = _mm512_mul_ps(wt392, postMul36);
wt393 = _mm512_mul_ps(wt393, postMul36);
wt394 = _mm512_mul_ps(wt394, postMul36);
wt395 = _mm512_mul_ps(wt395, postMul36);
wt396 = _mm512_mul_ps(wt396, postMul36);
wt397 = _mm512_mul_ps(wt397, postMul36);
wt398 = _mm512_mul_ps(wt398, postMul36);
wt399 = _mm512_mul_ps(wt399, postMul36);
wt400 = _mm512_mul_ps(wt400, postMul36);
wt401 = _mm512_mul_ps(wt401, postMul36);
wt402 = _mm512_mul_ps(wt402, postMul36);
wt403 = _mm512_mul_ps(wt403, postMul36);
wt404 = _mm512_mul_ps(wt404, postMul36);
wt405 = _mm512_mul_ps(wt405, postMul36);
wt406 = _mm512_mul_ps(wt406, postMul36);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(1+16*c35)+(ptrdiff_t)0, 63>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(2+16*c35)+(ptrdiff_t)0, 63>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(3+16*c35)+(ptrdiff_t)0, 63>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(4+16*c35)+(ptrdiff_t)0, 63>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(5+16*c35)+(ptrdiff_t)0, 63>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(6+16*c35)+(ptrdiff_t)0, 63>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(7+16*c35)+(ptrdiff_t)0, 63>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(8+16*c35)+(ptrdiff_t)0, 63>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(9+16*c35)+(ptrdiff_t)0, 63>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(10+16*c35)+(ptrdiff_t)0, 63>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(11+16*c35)+(ptrdiff_t)0, 63>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(12+16*c35)+(ptrdiff_t)0, 63>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(13+16*c35)+(ptrdiff_t)0, 63>>cut15, wt403);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(14+16*c35)+(ptrdiff_t)0, 63>>cut15, wt404);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(15+16*c35)+(ptrdiff_t)0, 63>>cut15, wt405);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(16+16*c35)+(ptrdiff_t)0, 63>>cut15, wt406);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(1+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(2+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(3+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(4+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(5+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(6+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(7+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(8+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(9+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(10+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(11+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(12+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(13+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt403);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(14+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt404);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(15+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt405);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(16+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt406);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(1+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt391);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(2+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt392);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(3+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt393);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(4+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt394);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(5+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt395);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(6+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt396);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(7+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt397);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(8+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt398);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(9+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt399);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(10+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt400);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(11+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt401);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(12+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt402);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(13+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt403);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(14+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt404);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(15+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt405);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(16+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt406);
}
}
}
}
}

/*
 * Submits the weight-arranging pass to the threader.
 *
 * Fills in a task descriptor that points at
 * ResNeXt50OneArrangeWts7Callee1, hands it the caller's tensor pointer
 * array unchanged, and declares three dimensions with extents 16, 1, 1
 * before passing it to ResNeXt50ThreaderDo1.
 *
 * team49    - threader team forwarded to ResNeXt50ThreaderDo1.
 * tensors71 - tensor pointer array stored in the task's any1 field.
 *
 * NOTE(review): hull1 presumably gives the per-dimension task counts
 * (16 work items here) - confirm against the threader implementation.
 */
static void ResNeXt50OneArrangeWts7(ResNeXt50ThreaderTeam1* team49, char** tensors71) {
	ResNeXt50ThreaderTask1 job;
	/* Payload and entry point. */
	job.any1 = tensors71;
	job.callee1 = ResNeXt50OneArrangeWts7Callee1;
	/* Three declared dimensions: 16 x 1 x 1. */
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 16;
	ResNeXt50ThreaderDo1(team49, &job);
}

/*
 * Threader callee for the data-arranging pass: copies one tile of the
 * source buffer (tensors[0]) into the arranged buffer (tensors[1]).
 *
 * task76 - task descriptor; its any1 field holds the tensor pointer array.
 * pt43   - task coordinates: pt43[0] selects a run of 128 consecutive
 *          source records (3136 bytes apart), pt43[1] selects a 256-byte
 *          column block inside each record.
 *
 * For column blocks other than 12, four 64-byte vectors are copied per
 * record and written 256 bytes apart in the destination. For column
 * block 12 only a single 64-byte vector is copied per record and the
 * destination stride shrinks to 64 bytes.
 *
 * NOTE(review): 3136 bytes is presumably one 28x28 float plane per
 * channel, making block 12 the 16-float remainder - confirm against
 * the generator's layout.
 */
static void ResNeXt50OneArrangeDats7Callee1(ResNeXt50ThreaderTask1* task76, int64_t* pt43) {
	char** bufs = task76->any1;
	const ptrdiff_t runIdx = pt43[0];
	const ptrdiff_t colBlock = pt43[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t recBegin = 128*runIdx;
	const ptrdiff_t recEnd = recBegin+128;

	if (colBlock != 12) {
		/* Full-width block: move four vectors per record. */
		for (ptrdiff_t rec = recBegin; rec < recEnd; ++rec) {
			char* in = src+256*colBlock+3136*rec;
			char* out = dst+131072*colBlock+256*rec;
			/* All loads are issued before any store, as in the generated code. */
			__m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
			__m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
			__m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
			__m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
		}
		return;
	}

	/* Column block 12: a single vector per record, packed 64 bytes apart. */
	for (ptrdiff_t rec = recBegin; rec < recEnd; ++rec) {
		__m512 v = _mm512_maskz_loadu_ps(65535, src+256*colBlock+3136*rec);
		_mm512_mask_storeu_ps(dst+131072*colBlock+64*rec, 65535, v);
	}
}

/*
 * Submits the data-arranging pass to the threader.
 *
 * Fills in a task descriptor that points at
 * ResNeXt50OneArrangeDats7Callee1, hands it the caller's tensor pointer
 * array unchanged, and declares four dimensions with extents 4, 13, 1, 1
 * before passing it to ResNeXt50ThreaderDo1.
 *
 * team50    - threader team forwarded to ResNeXt50ThreaderDo1.
 * tensors73 - tensor pointer array stored in the task's any1 field.
 *
 * NOTE(review): the first two extents presumably bound the pt[0] and
 * pt[1] coordinates the callee reads (4 runs x 13 column blocks) -
 * confirm against the threader implementation.
 */
static void ResNeXt50OneArrangeDats7(ResNeXt50ThreaderTeam1* team50, char** tensors73) {
	ResNeXt50ThreaderTask1 job;
	/* Payload and entry point. */
	job.any1 = tensors73;
	job.callee1 = ResNeXt50OneArrangeDats7Callee1;
	/* Four declared dimensions: 4 x 13 x 1 x 1. */
	job.nd1 = 4;
	job.hull1[3] = 1;
	job.hull1[2] = 1;
	job.hull1[1] = 13;
	job.hull1[0] = 4;
	ResNeXt50ThreaderDo1(team50, &job);
}

// Computes one (k, j) cell of a weights-times-data product with a final
// clamp at zero, writing the result into the tensor at tensors76[2].
//
// task78->any1 is a two-entry pointer pair; only entry 0 (an array of tensor
// pointers) is read here:
//   tensors76[0]  arranged weights (read)
//   tensors76[1]  arranged data    (read)
//   tensors76[2]  output           (written)
// pt44[0] is the k index (weight group) and pt44[1] is the j index (column
// block of the data/output).
//
// Weight layout as read below: each k group occupies 12312 bytes. For k != 42
// a group holds 6 floats per step (24 bytes): the first 24 bytes seed the
// accumulators and the following 512 steps are the multiplicands
// (24 + 512*24 = 12312). For k == 42 only 4 floats per step (16 bytes) are
// used. NOTE(review): the seed values are presumably the (batch-norm folded)
// bias and the 6/4 floats presumably consecutive output channels -- confirm
// against the weight-arranging code.
//
// Data layout as read below: column block j starts at 131072*j; for j != 12
// each of the 512 steps supplies four 16-float vectors (256 bytes), for
// j == 12 a single vector (64 bytes).
//
// Output layout as written below: rows are 3136 bytes apart, a k group
// advances 18816 bytes (6 rows), and column block j starts at 256*j.
//
// Each `for (; x != N; ++x)` loop below runs its body at most once per call:
// the `if (x >= xx) return;` at the end of the body fires on the first pass
// because xx equals the starting index. The code after such a loop is the
// tail case, reached only when the starting index already equals N.
static void ResNeXt50OneApply7Callee1(ResNeXt50ThreaderTask1* task78, int64_t* pt44) {
void** pair18 = task78->any1;
char** tensors76 = pair18[0];
// e22 and g24 are fixed at 0 here, so the base offsets below reduce to the
// raw tensor pointers.
ptrdiff_t e22 = 0;
ptrdiff_t g24 = 0;
ptrdiff_t d15 = pt44[1];
ptrdiff_t w55 = pt44[0];
char*restrict arrangedWts7 = tensors76[0]+856064*e22+(ptrdiff_t)525312*1*g24;
char*restrict arrangedDats7 = tensors76[1]+2618560*e22+(ptrdiff_t)1605632*1*g24;
char*restrict datPtr24 = tensors76[2]+(ptrdiff_t)802816*1*g24;
ptrdiff_t ii33 = 1;
for (ptrdiff_t i47 = 0; i47 < ii33; ++i47) {
ptrdiff_t j40 = 1*d15;
ptrdiff_t jj43 = j40+0;
// ---- Full-width column block (j != 12): four data vectors per step. ----
for (; j40 != 12; ++j40) {
ptrdiff_t k127 = 1*w55;
ptrdiff_t kk43 = k127+0;
// Case j != 12, k != 42: 6 rows x 4 vectors = 24 accumulators.
for (; k127 != 42; ++k127) {
// With s41 == -1 the offsets 24*s41+24.. resolve to the first 24 bytes of the
// k group, i.e. the six seed values, each broadcast to all lanes.
ptrdiff_t s41 = -1;
__m512 sum312 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)24));
__m512 sum316 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)28));
__m512 sum320 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)32));
__m512 sum324 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)36));
__m512 sum328 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)40));
__m512 sum332 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)44));
// Replicate each row's seed across that row's four column accumulators.
__m512 sum313 = sum312;
__m512 sum314 = sum312;
__m512 sum315 = sum312;
__m512 sum317 = sum316;
__m512 sum318 = sum316;
__m512 sum319 = sum316;
__m512 sum321 = sum320;
__m512 sum322 = sum320;
__m512 sum323 = sum320;
__m512 sum325 = sum324;
__m512 sum326 = sum324;
__m512 sum327 = sum324;
__m512 sum329 = sum328;
__m512 sum330 = sum328;
__m512 sum331 = sum328;
__m512 sum333 = sum332;
__m512 sum334 = sum332;
__m512 sum335 = sum332;
// 512 accumulation steps: load four data vectors once, then for each of the
// six weights broadcast it and fuse-multiply-add into that row's accumulators.
for (s41 = 0; s41 < 512; ++s41) {
__m512 dat2023 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)0);
__m512 dat2024 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)64);
__m512 dat2025 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)128);
__m512 dat2026 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)192);
__m512 wt439 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)24));
sum312 = _mm512_fmadd_ps(wt439, dat2023, sum312);
sum313 = _mm512_fmadd_ps(wt439, dat2024, sum313);
sum314 = _mm512_fmadd_ps(wt439, dat2025, sum314);
sum315 = _mm512_fmadd_ps(wt439, dat2026, sum315);
__m512 wt440 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)28));
sum316 = _mm512_fmadd_ps(wt440, dat2023, sum316);
sum317 = _mm512_fmadd_ps(wt440, dat2024, sum317);
sum318 = _mm512_fmadd_ps(wt440, dat2025, sum318);
sum319 = _mm512_fmadd_ps(wt440, dat2026, sum319);
__m512 wt441 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)32));
sum320 = _mm512_fmadd_ps(wt441, dat2023, sum320);
sum321 = _mm512_fmadd_ps(wt441, dat2024, sum321);
sum322 = _mm512_fmadd_ps(wt441, dat2025, sum322);
sum323 = _mm512_fmadd_ps(wt441, dat2026, sum323);
__m512 wt442 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)36));
sum324 = _mm512_fmadd_ps(wt442, dat2023, sum324);
sum325 = _mm512_fmadd_ps(wt442, dat2024, sum325);
sum326 = _mm512_fmadd_ps(wt442, dat2025, sum326);
sum327 = _mm512_fmadd_ps(wt442, dat2026, sum327);
__m512 wt443 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)40));
sum328 = _mm512_fmadd_ps(wt443, dat2023, sum328);
sum329 = _mm512_fmadd_ps(wt443, dat2024, sum329);
sum330 = _mm512_fmadd_ps(wt443, dat2025, sum330);
sum331 = _mm512_fmadd_ps(wt443, dat2026, sum331);
__m512 wt444 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)44));
sum332 = _mm512_fmadd_ps(wt444, dat2023, sum332);
sum333 = _mm512_fmadd_ps(wt444, dat2024, sum333);
sum334 = _mm512_fmadd_ps(wt444, dat2025, sum334);
sum335 = _mm512_fmadd_ps(wt444, dat2026, sum335);
}
// Clamp each accumulator at zero (max with 0) and store. Row r of the group
// lands at byte offset 3136*r; the four vectors of a row are contiguous.
sum312 = _mm512_max_ps(_mm512_setzero_ps(), sum312);
sum313 = _mm512_max_ps(_mm512_setzero_ps(), sum313);
sum314 = _mm512_max_ps(_mm512_setzero_ps(), sum314);
sum315 = _mm512_max_ps(_mm512_setzero_ps(), sum315);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)0, 65535, sum312);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)64, 65535, sum313);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)128, 65535, sum314);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)192, 65535, sum315);
sum316 = _mm512_max_ps(_mm512_setzero_ps(), sum316);
sum317 = _mm512_max_ps(_mm512_setzero_ps(), sum317);
sum318 = _mm512_max_ps(_mm512_setzero_ps(), sum318);
sum319 = _mm512_max_ps(_mm512_setzero_ps(), sum319);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3136, 65535, sum316);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3200, 65535, sum317);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3264, 65535, sum318);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3328, 65535, sum319);
sum320 = _mm512_max_ps(_mm512_setzero_ps(), sum320);
sum321 = _mm512_max_ps(_mm512_setzero_ps(), sum321);
sum322 = _mm512_max_ps(_mm512_setzero_ps(), sum322);
sum323 = _mm512_max_ps(_mm512_setzero_ps(), sum323);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6272, 65535, sum320);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6336, 65535, sum321);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6400, 65535, sum322);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6464, 65535, sum323);
sum324 = _mm512_max_ps(_mm512_setzero_ps(), sum324);
sum325 = _mm512_max_ps(_mm512_setzero_ps(), sum325);
sum326 = _mm512_max_ps(_mm512_setzero_ps(), sum326);
sum327 = _mm512_max_ps(_mm512_setzero_ps(), sum327);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9408, 65535, sum324);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9472, 65535, sum325);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9536, 65535, sum326);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9600, 65535, sum327);
sum328 = _mm512_max_ps(_mm512_setzero_ps(), sum328);
sum329 = _mm512_max_ps(_mm512_setzero_ps(), sum329);
sum330 = _mm512_max_ps(_mm512_setzero_ps(), sum330);
sum331 = _mm512_max_ps(_mm512_setzero_ps(), sum331);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12544, 65535, sum328);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12608, 65535, sum329);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12672, 65535, sum330);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12736, 65535, sum331);
sum332 = _mm512_max_ps(_mm512_setzero_ps(), sum332);
sum333 = _mm512_max_ps(_mm512_setzero_ps(), sum333);
sum334 = _mm512_max_ps(_mm512_setzero_ps(), sum334);
sum335 = _mm512_max_ps(_mm512_setzero_ps(), sum335);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15680, 65535, sum332);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15744, 65535, sum333);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15808, 65535, sum334);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15872, 65535, sum335);
// kk43 == starting k, so this always returns after the single pass.
if (k127 >= kk43) return;
}
// Case j != 12, k == 42 (tail weight group): 4 rows x 4 vectors, weights
// packed 16 bytes per step instead of 24.
ptrdiff_t s42 = -1;
__m512 sum336 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)16));
__m512 sum340 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)20));
__m512 sum344 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)24));
__m512 sum348 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)28));
__m512 sum337 = sum336;
__m512 sum338 = sum336;
__m512 sum339 = sum336;
__m512 sum341 = sum340;
__m512 sum342 = sum340;
__m512 sum343 = sum340;
__m512 sum345 = sum344;
__m512 sum346 = sum344;
__m512 sum347 = sum344;
__m512 sum349 = sum348;
__m512 sum350 = sum348;
__m512 sum351 = sum348;
for (s42 = 0; s42 < 512; ++s42) {
__m512 dat2027 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)0);
__m512 dat2028 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)64);
__m512 dat2029 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)128);
__m512 dat2030 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)192);
__m512 wt445 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)16));
sum336 = _mm512_fmadd_ps(wt445, dat2027, sum336);
sum337 = _mm512_fmadd_ps(wt445, dat2028, sum337);
sum338 = _mm512_fmadd_ps(wt445, dat2029, sum338);
sum339 = _mm512_fmadd_ps(wt445, dat2030, sum339);
__m512 wt446 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)20));
sum340 = _mm512_fmadd_ps(wt446, dat2027, sum340);
sum341 = _mm512_fmadd_ps(wt446, dat2028, sum341);
sum342 = _mm512_fmadd_ps(wt446, dat2029, sum342);
sum343 = _mm512_fmadd_ps(wt446, dat2030, sum343);
__m512 wt447 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)24));
sum344 = _mm512_fmadd_ps(wt447, dat2027, sum344);
sum345 = _mm512_fmadd_ps(wt447, dat2028, sum345);
sum346 = _mm512_fmadd_ps(wt447, dat2029, sum346);
sum347 = _mm512_fmadd_ps(wt447, dat2030, sum347);
__m512 wt448 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)28));
sum348 = _mm512_fmadd_ps(wt448, dat2027, sum348);
sum349 = _mm512_fmadd_ps(wt448, dat2028, sum349);
sum350 = _mm512_fmadd_ps(wt448, dat2029, sum350);
sum351 = _mm512_fmadd_ps(wt448, dat2030, sum351);
}
sum336 = _mm512_max_ps(_mm512_setzero_ps(), sum336);
sum337 = _mm512_max_ps(_mm512_setzero_ps(), sum337);
sum338 = _mm512_max_ps(_mm512_setzero_ps(), sum338);
sum339 = _mm512_max_ps(_mm512_setzero_ps(), sum339);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)0, 65535, sum336);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)64, 65535, sum337);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)128, 65535, sum338);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)192, 65535, sum339);
sum340 = _mm512_max_ps(_mm512_setzero_ps(), sum340);
sum341 = _mm512_max_ps(_mm512_setzero_ps(), sum341);
sum342 = _mm512_max_ps(_mm512_setzero_ps(), sum342);
sum343 = _mm512_max_ps(_mm512_setzero_ps(), sum343);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3136, 65535, sum340);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3200, 65535, sum341);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3264, 65535, sum342);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3328, 65535, sum343);
sum344 = _mm512_max_ps(_mm512_setzero_ps(), sum344);
sum345 = _mm512_max_ps(_mm512_setzero_ps(), sum345);
sum346 = _mm512_max_ps(_mm512_setzero_ps(), sum346);
sum347 = _mm512_max_ps(_mm512_setzero_ps(), sum347);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6272, 65535, sum344);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6336, 65535, sum345);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6400, 65535, sum346);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6464, 65535, sum347);
sum348 = _mm512_max_ps(_mm512_setzero_ps(), sum348);
sum349 = _mm512_max_ps(_mm512_setzero_ps(), sum349);
sum350 = _mm512_max_ps(_mm512_setzero_ps(), sum350);
sum351 = _mm512_max_ps(_mm512_setzero_ps(), sum351);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9408, 65535, sum348);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9472, 65535, sum349);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9536, 65535, sum350);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9600, 65535, sum351);
// jj43 == starting j, so this always returns after the single pass.
if (j40 >= jj43) return;
}
// ---- Tail column block (j == 12): one data vector per step, read with a
// 64-byte step stride instead of 256. ----
ptrdiff_t k128 = 1*w55;
ptrdiff_t kk44 = k128+0;
// Case j == 12, k != 42: 6 rows x 1 vector.
for (; k128 != 42; ++k128) {
ptrdiff_t s43 = -1;
__m512 sum352 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)24));
__m512 sum353 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)28));
__m512 sum354 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)32));
__m512 sum355 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)36));
__m512 sum356 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)40));
__m512 sum357 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)44));
for (s43 = 0; s43 < 512; ++s43) {
__m512 dat2031 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+64*s43+(ptrdiff_t)0);
__m512 wt449 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)24));
sum352 = _mm512_fmadd_ps(wt449, dat2031, sum352);
__m512 wt450 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)28));
sum353 = _mm512_fmadd_ps(wt450, dat2031, sum353);
__m512 wt451 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)32));
sum354 = _mm512_fmadd_ps(wt451, dat2031, sum354);
__m512 wt452 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)36));
sum355 = _mm512_fmadd_ps(wt452, dat2031, sum355);
__m512 wt453 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)40));
sum356 = _mm512_fmadd_ps(wt453, dat2031, sum356);
__m512 wt454 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)44));
sum357 = _mm512_fmadd_ps(wt454, dat2031, sum357);
}
sum352 = _mm512_max_ps(_mm512_setzero_ps(), sum352);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)0, 65535, sum352);
sum353 = _mm512_max_ps(_mm512_setzero_ps(), sum353);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)3136, 65535, sum353);
sum354 = _mm512_max_ps(_mm512_setzero_ps(), sum354);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)6272, 65535, sum354);
sum355 = _mm512_max_ps(_mm512_setzero_ps(), sum355);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)9408, 65535, sum355);
sum356 = _mm512_max_ps(_mm512_setzero_ps(), sum356);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)12544, 65535, sum356);
sum357 = _mm512_max_ps(_mm512_setzero_ps(), sum357);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)15680, 65535, sum357);
// kk44 == starting k, so this always returns after the single pass.
if (k128 >= kk44) return;
}
// Case j == 12, k == 42: 4 rows x 1 vector, 16-byte weight steps.
ptrdiff_t s44 = -1;
__m512 sum358 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)16));
__m512 sum359 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)20));
__m512 sum360 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)24));
__m512 sum361 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)28));
for (s44 = 0; s44 < 512; ++s44) {
__m512 dat2032 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+64*s44+(ptrdiff_t)0);
__m512 wt455 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)16));
sum358 = _mm512_fmadd_ps(wt455, dat2032, sum358);
__m512 wt456 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)20));
sum359 = _mm512_fmadd_ps(wt456, dat2032, sum359);
__m512 wt457 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)24));
sum360 = _mm512_fmadd_ps(wt457, dat2032, sum360);
__m512 wt458 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)28));
sum361 = _mm512_fmadd_ps(wt458, dat2032, sum361);
}
sum358 = _mm512_max_ps(_mm512_setzero_ps(), sum358);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)0, 65535, sum358);
sum359 = _mm512_max_ps(_mm512_setzero_ps(), sum359);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)3136, 65535, sum359);
sum360 = _mm512_max_ps(_mm512_setzero_ps(), sum360);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)6272, 65535, sum360);
sum361 = _mm512_max_ps(_mm512_setzero_ps(), sum361);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)9408, 65535, sum361);
}
}

// Dispatches ResNeXt50OneApply7Callee1 over a 3-dimensional hull of extents
// 43 x 13 x 1 on the given team. The callee receives a two-slot pointer
// pair: slot 0 is `tensors75`, slot 1 is null.
static void ResNeXt50OneApply7(ResNeXt50ThreaderTeam1* team51, char** tensors75) {
    static const int extents[3] = {43, 13, 1};
    void* args[2];
    args[0] = tensors75;
    args[1] = 0;
    ResNeXt50ThreaderTask1 job;
    job.callee1 = ResNeXt50OneApply7Callee1;
    job.any1 = args;
    job.nd1 = 3;
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    ResNeXt50ThreaderDo1(team51, &job);
}

static void ResNeXt50OneArrangeWts8Callee1(ResNeXt50ThreaderTask1* task88, int64_t* pt49) {
char** tensors86 = task88->any1;
ptrdiff_t b74 = pt49[0];
char*restrict wtPtr14 = tensors86[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2097152*0;
char*restrict biasPtr14 = tensors86[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr15 = tensors86[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged15 = tensors86[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)2101248*0;
ptrdiff_t ii38 = 1;
for (ptrdiff_t i53 = 0; i53 < ii38; ++i53) {
ptrdiff_t j45 = 1*b74;
ptrdiff_t jj45 = j45+1;
for (; j45 < jj45; ++j45) {
if (j45 < 63) {
ptrdiff_t k142 = 0+16*(j45-0);
ptrdiff_t l61 = (size_t)(0+k142)/6;
ptrdiff_t cut19 = (size_t)(0+k142)%6;
switch (cut19) {
case 0:;
case 2: {
__m512 sum391 = _mm512_maskz_loadu_ps(65535, biasPtr14+4096*i53+4*k142);
__m512i pmMul28 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd28 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo23 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1024*i53));
__m512 masHi23 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1024*i53)+(ptrdiff_t)64);
__m512 postMul45 = _mm512_permutex2var_ps(masLo23, pmMul28, masHi23);
__m512 postAdd29 = _mm512_permutex2var_ps(masLo23, pmAdd28, masHi23);
sum391 = _mm512_fmadd_ps(sum391, postMul45, postAdd29);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum391);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum391);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)24576, 65535-(4095>>cut19), sum391);
ptrdiff_t c41 = 0;
for (; c41 != 32; ++c41) {
__m512 wt479 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)0);
__m512 wt480 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)2048);
__m512 wt481 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)4096);
__m512 wt482 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)6144);
__m512 wt483 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)8192);
__m512 wt484 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)10240);
__m512 wt485 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)12288);
__m512 wt486 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)14336);
__m512 wt487 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)16384);
__m512 wt488 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)18432);
__m512 wt489 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)20480);
__m512 wt490 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)22528);
__m512 wt491 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)24576);
__m512 wt492 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)26624);
__m512 wt493 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)28672);
__m512 wt494 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)30720);
__m512 tmp13699 = _mm512_unpacklo_ps(wt479, wt480);
__m512 tmp13700 = _mm512_unpackhi_ps(wt479, wt480);
__m512 tmp13701 = _mm512_unpacklo_ps(wt481, wt482);
__m512 tmp13702 = _mm512_unpackhi_ps(wt481, wt482);
__m512 tmp13703 = _mm512_unpacklo_ps(wt483, wt484);
__m512 tmp13704 = _mm512_unpackhi_ps(wt483, wt484);
__m512 tmp13705 = _mm512_unpacklo_ps(wt485, wt486);
__m512 tmp13706 = _mm512_unpackhi_ps(wt485, wt486);
__m512 tmp13707 = _mm512_unpacklo_ps(wt487, wt488);
__m512 tmp13708 = _mm512_unpackhi_ps(wt487, wt488);
__m512 tmp13709 = _mm512_unpacklo_ps(wt489, wt490);
__m512 tmp13710 = _mm512_unpackhi_ps(wt489, wt490);
__m512 tmp13711 = _mm512_unpacklo_ps(wt491, wt492);
__m512 tmp13712 = _mm512_unpackhi_ps(wt491, wt492);
__m512 tmp13713 = _mm512_unpacklo_ps(wt493, wt494);
__m512 tmp13714 = _mm512_unpackhi_ps(wt493, wt494);
__m512 tmp13715 = _mm512_shuffle_ps(tmp13699, tmp13701, 68);
__m512 tmp13716 = _mm512_shuffle_ps(tmp13699, tmp13701, 238);
__m512 tmp13717 = _mm512_shuffle_ps(tmp13700, tmp13702, 68);
__m512 tmp13718 = _mm512_shuffle_ps(tmp13700, tmp13702, 238);
__m512 tmp13719 = _mm512_shuffle_ps(tmp13703, tmp13705, 68);
__m512 tmp13720 = _mm512_shuffle_ps(tmp13703, tmp13705, 238);
__m512 tmp13721 = _mm512_shuffle_ps(tmp13704, tmp13706, 68);
__m512 tmp13722 = _mm512_shuffle_ps(tmp13704, tmp13706, 238);
__m512 tmp13723 = _mm512_shuffle_ps(tmp13707, tmp13709, 68);
__m512 tmp13724 = _mm512_shuffle_ps(tmp13707, tmp13709, 238);
__m512 tmp13725 = _mm512_shuffle_ps(tmp13708, tmp13710, 68);
__m512 tmp13726 = _mm512_shuffle_ps(tmp13708, tmp13710, 238);
__m512 tmp13727 = _mm512_shuffle_ps(tmp13711, tmp13713, 68);
__m512 tmp13728 = _mm512_shuffle_ps(tmp13711, tmp13713, 238);
__m512 tmp13729 = _mm512_shuffle_ps(tmp13712, tmp13714, 68);
__m512 tmp13730 = _mm512_shuffle_ps(tmp13712, tmp13714, 238);
__m512 tmp13731 = _mm512_shuffle_f32x4(tmp13715, tmp13719, 136);
__m512 tmp13732 = _mm512_shuffle_f32x4(tmp13715, tmp13719, 221);
__m512 tmp13733 = _mm512_shuffle_f32x4(tmp13716, tmp13720, 136);
__m512 tmp13734 = _mm512_shuffle_f32x4(tmp13716, tmp13720, 221);
__m512 tmp13735 = _mm512_shuffle_f32x4(tmp13717, tmp13721, 136);
__m512 tmp13736 = _mm512_shuffle_f32x4(tmp13717, tmp13721, 221);
__m512 tmp13737 = _mm512_shuffle_f32x4(tmp13718, tmp13722, 136);
__m512 tmp13738 = _mm512_shuffle_f32x4(tmp13718, tmp13722, 221);
__m512 tmp13739 = _mm512_shuffle_f32x4(tmp13723, tmp13727, 136);
__m512 tmp13740 = _mm512_shuffle_f32x4(tmp13723, tmp13727, 221);
__m512 tmp13741 = _mm512_shuffle_f32x4(tmp13724, tmp13728, 136);
__m512 tmp13742 = _mm512_shuffle_f32x4(tmp13724, tmp13728, 221);
__m512 tmp13743 = _mm512_shuffle_f32x4(tmp13725, tmp13729, 136);
__m512 tmp13744 = _mm512_shuffle_f32x4(tmp13725, tmp13729, 221);
__m512 tmp13745 = _mm512_shuffle_f32x4(tmp13726, tmp13730, 136);
__m512 tmp13746 = _mm512_shuffle_f32x4(tmp13726, tmp13730, 221);
wt479 = _mm512_shuffle_f32x4(tmp13731, tmp13739, 136);
wt487 = _mm512_shuffle_f32x4(tmp13731, tmp13739, 221);
wt480 = _mm512_shuffle_f32x4(tmp13733, tmp13741, 136);
wt488 = _mm512_shuffle_f32x4(tmp13733, tmp13741, 221);
wt481 = _mm512_shuffle_f32x4(tmp13735, tmp13743, 136);
wt489 = _mm512_shuffle_f32x4(tmp13735, tmp13743, 221);
wt482 = _mm512_shuffle_f32x4(tmp13737, tmp13745, 136);
wt490 = _mm512_shuffle_f32x4(tmp13737, tmp13745, 221);
wt483 = _mm512_shuffle_f32x4(tmp13732, tmp13740, 136);
wt491 = _mm512_shuffle_f32x4(tmp13732, tmp13740, 221);
wt484 = _mm512_shuffle_f32x4(tmp13734, tmp13742, 136);
wt492 = _mm512_shuffle_f32x4(tmp13734, tmp13742, 221);
wt485 = _mm512_shuffle_f32x4(tmp13736, tmp13744, 136);
wt493 = _mm512_shuffle_f32x4(tmp13736, tmp13744, 221);
wt486 = _mm512_shuffle_f32x4(tmp13738, tmp13746, 136);
wt494 = _mm512_shuffle_f32x4(tmp13738, tmp13746, 221);
wt479 = _mm512_mul_ps(wt479, postMul45);
wt480 = _mm512_mul_ps(wt480, postMul45);
wt481 = _mm512_mul_ps(wt481, postMul45);
wt482 = _mm512_mul_ps(wt482, postMul45);
wt483 = _mm512_mul_ps(wt483, postMul45);
wt484 = _mm512_mul_ps(wt484, postMul45);
wt485 = _mm512_mul_ps(wt485, postMul45);
wt486 = _mm512_mul_ps(wt486, postMul45);
wt487 = _mm512_mul_ps(wt487, postMul45);
wt488 = _mm512_mul_ps(wt488, postMul45);
wt489 = _mm512_mul_ps(wt489, postMul45);
wt490 = _mm512_mul_ps(wt490, postMul45);
wt491 = _mm512_mul_ps(wt491, postMul45);
wt492 = _mm512_mul_ps(wt492, postMul45);
wt493 = _mm512_mul_ps(wt493, postMul45);
wt494 = _mm512_mul_ps(wt494, postMul45);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c41)+(ptrdiff_t)0, 63>>cut19, wt479);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c41)+(ptrdiff_t)0, 63>>cut19, wt480);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c41)+(ptrdiff_t)0, 63>>cut19, wt481);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c41)+(ptrdiff_t)0, 63>>cut19, wt482);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c41)+(ptrdiff_t)0, 63>>cut19, wt483);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c41)+(ptrdiff_t)0, 63>>cut19, wt484);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c41)+(ptrdiff_t)0, 63>>cut19, wt485);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c41)+(ptrdiff_t)0, 63>>cut19, wt486);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c41)+(ptrdiff_t)0, 63>>cut19, wt487);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c41)+(ptrdiff_t)0, 63>>cut19, wt488);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c41)+(ptrdiff_t)0, 63>>cut19, wt489);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c41)+(ptrdiff_t)0, 63>>cut19, wt490);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c41)+(ptrdiff_t)0, 63>>cut19, wt491);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c41)+(ptrdiff_t)0, 63>>cut19, wt492);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c41)+(ptrdiff_t)0, 63>>cut19, wt493);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c41)+(ptrdiff_t)0, 63>>cut19, wt494);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt479);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt480);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt481);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt482);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt483);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt484);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt485);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt486);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt487);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt488);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt489);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt490);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt491);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt492);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt493);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt494);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt479);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt480);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt481);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt482);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt483);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt484);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt485);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt486);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt487);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt488);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt489);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt490);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt491);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt492);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt493);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt494);
}
break;
}
default: {
cut19 = 4;
__m512 sum392 = _mm512_maskz_loadu_ps(65535, biasPtr14+4096*i53+4*k142);
__m512i pmMul29 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd29 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo24 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1024*i53));
__m512 masHi24 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1024*i53)+(ptrdiff_t)64);
__m512 postMul46 = _mm512_permutex2var_ps(masLo24, pmMul29, masHi24);
__m512 postAdd30 = _mm512_permutex2var_ps(masLo24, pmAdd29, masHi24);
sum392 = _mm512_fmadd_ps(sum392, postMul46, postAdd30);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum392);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum392);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)24576, 258048>>cut19, sum392);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)36864, 65535-(262143>>cut19), sum392);
ptrdiff_t c42 = 0;
for (; c42 != 32; ++c42) {
__m512 wt495 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)0);
__m512 wt496 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)2048);
__m512 wt497 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)4096);
__m512 wt498 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)6144);
__m512 wt499 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)8192);
__m512 wt500 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)10240);
__m512 wt501 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)12288);
__m512 wt502 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)14336);
__m512 wt503 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)16384);
__m512 wt504 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)18432);
__m512 wt505 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)20480);
__m512 wt506 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)22528);
__m512 wt507 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)24576);
__m512 wt508 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)26624);
__m512 wt509 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)28672);
__m512 wt510 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)30720);
__m512 tmp13747 = _mm512_unpacklo_ps(wt495, wt496);
__m512 tmp13748 = _mm512_unpackhi_ps(wt495, wt496);
__m512 tmp13749 = _mm512_unpacklo_ps(wt497, wt498);
__m512 tmp13750 = _mm512_unpackhi_ps(wt497, wt498);
__m512 tmp13751 = _mm512_unpacklo_ps(wt499, wt500);
__m512 tmp13752 = _mm512_unpackhi_ps(wt499, wt500);
__m512 tmp13753 = _mm512_unpacklo_ps(wt501, wt502);
__m512 tmp13754 = _mm512_unpackhi_ps(wt501, wt502);
__m512 tmp13755 = _mm512_unpacklo_ps(wt503, wt504);
__m512 tmp13756 = _mm512_unpackhi_ps(wt503, wt504);
__m512 tmp13757 = _mm512_unpacklo_ps(wt505, wt506);
__m512 tmp13758 = _mm512_unpackhi_ps(wt505, wt506);
__m512 tmp13759 = _mm512_unpacklo_ps(wt507, wt508);
__m512 tmp13760 = _mm512_unpackhi_ps(wt507, wt508);
__m512 tmp13761 = _mm512_unpacklo_ps(wt509, wt510);
__m512 tmp13762 = _mm512_unpackhi_ps(wt509, wt510);
__m512 tmp13763 = _mm512_shuffle_ps(tmp13747, tmp13749, 68);
__m512 tmp13764 = _mm512_shuffle_ps(tmp13747, tmp13749, 238);
__m512 tmp13765 = _mm512_shuffle_ps(tmp13748, tmp13750, 68);
__m512 tmp13766 = _mm512_shuffle_ps(tmp13748, tmp13750, 238);
__m512 tmp13767 = _mm512_shuffle_ps(tmp13751, tmp13753, 68);
__m512 tmp13768 = _mm512_shuffle_ps(tmp13751, tmp13753, 238);
__m512 tmp13769 = _mm512_shuffle_ps(tmp13752, tmp13754, 68);
__m512 tmp13770 = _mm512_shuffle_ps(tmp13752, tmp13754, 238);
__m512 tmp13771 = _mm512_shuffle_ps(tmp13755, tmp13757, 68);
__m512 tmp13772 = _mm512_shuffle_ps(tmp13755, tmp13757, 238);
__m512 tmp13773 = _mm512_shuffle_ps(tmp13756, tmp13758, 68);
__m512 tmp13774 = _mm512_shuffle_ps(tmp13756, tmp13758, 238);
__m512 tmp13775 = _mm512_shuffle_ps(tmp13759, tmp13761, 68);
__m512 tmp13776 = _mm512_shuffle_ps(tmp13759, tmp13761, 238);
__m512 tmp13777 = _mm512_shuffle_ps(tmp13760, tmp13762, 68);
__m512 tmp13778 = _mm512_shuffle_ps(tmp13760, tmp13762, 238);
__m512 tmp13779 = _mm512_shuffle_f32x4(tmp13763, tmp13767, 136);
__m512 tmp13780 = _mm512_shuffle_f32x4(tmp13763, tmp13767, 221);
__m512 tmp13781 = _mm512_shuffle_f32x4(tmp13764, tmp13768, 136);
__m512 tmp13782 = _mm512_shuffle_f32x4(tmp13764, tmp13768, 221);
__m512 tmp13783 = _mm512_shuffle_f32x4(tmp13765, tmp13769, 136);
__m512 tmp13784 = _mm512_shuffle_f32x4(tmp13765, tmp13769, 221);
__m512 tmp13785 = _mm512_shuffle_f32x4(tmp13766, tmp13770, 136);
__m512 tmp13786 = _mm512_shuffle_f32x4(tmp13766, tmp13770, 221);
__m512 tmp13787 = _mm512_shuffle_f32x4(tmp13771, tmp13775, 136);
__m512 tmp13788 = _mm512_shuffle_f32x4(tmp13771, tmp13775, 221);
__m512 tmp13789 = _mm512_shuffle_f32x4(tmp13772, tmp13776, 136);
__m512 tmp13790 = _mm512_shuffle_f32x4(tmp13772, tmp13776, 221);
__m512 tmp13791 = _mm512_shuffle_f32x4(tmp13773, tmp13777, 136);
__m512 tmp13792 = _mm512_shuffle_f32x4(tmp13773, tmp13777, 221);
__m512 tmp13793 = _mm512_shuffle_f32x4(tmp13774, tmp13778, 136);
__m512 tmp13794 = _mm512_shuffle_f32x4(tmp13774, tmp13778, 221);
wt495 = _mm512_shuffle_f32x4(tmp13779, tmp13787, 136);
wt503 = _mm512_shuffle_f32x4(tmp13779, tmp13787, 221);
wt496 = _mm512_shuffle_f32x4(tmp13781, tmp13789, 136);
wt504 = _mm512_shuffle_f32x4(tmp13781, tmp13789, 221);
wt497 = _mm512_shuffle_f32x4(tmp13783, tmp13791, 136);
wt505 = _mm512_shuffle_f32x4(tmp13783, tmp13791, 221);
wt498 = _mm512_shuffle_f32x4(tmp13785, tmp13793, 136);
wt506 = _mm512_shuffle_f32x4(tmp13785, tmp13793, 221);
wt499 = _mm512_shuffle_f32x4(tmp13780, tmp13788, 136);
wt507 = _mm512_shuffle_f32x4(tmp13780, tmp13788, 221);
wt500 = _mm512_shuffle_f32x4(tmp13782, tmp13790, 136);
wt508 = _mm512_shuffle_f32x4(tmp13782, tmp13790, 221);
wt501 = _mm512_shuffle_f32x4(tmp13784, tmp13792, 136);
wt509 = _mm512_shuffle_f32x4(tmp13784, tmp13792, 221);
wt502 = _mm512_shuffle_f32x4(tmp13786, tmp13794, 136);
wt510 = _mm512_shuffle_f32x4(tmp13786, tmp13794, 221);
wt495 = _mm512_mul_ps(wt495, postMul46);
wt496 = _mm512_mul_ps(wt496, postMul46);
wt497 = _mm512_mul_ps(wt497, postMul46);
wt498 = _mm512_mul_ps(wt498, postMul46);
wt499 = _mm512_mul_ps(wt499, postMul46);
wt500 = _mm512_mul_ps(wt500, postMul46);
wt501 = _mm512_mul_ps(wt501, postMul46);
wt502 = _mm512_mul_ps(wt502, postMul46);
wt503 = _mm512_mul_ps(wt503, postMul46);
wt504 = _mm512_mul_ps(wt504, postMul46);
wt505 = _mm512_mul_ps(wt505, postMul46);
wt506 = _mm512_mul_ps(wt506, postMul46);
wt507 = _mm512_mul_ps(wt507, postMul46);
wt508 = _mm512_mul_ps(wt508, postMul46);
wt509 = _mm512_mul_ps(wt509, postMul46);
wt510 = _mm512_mul_ps(wt510, postMul46);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)0, 63>>cut19, wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)0, 63>>cut19, wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)0, 63>>cut19, wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)0, 63>>cut19, wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)0, 63>>cut19, wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)0, 63>>cut19, wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)0, 63>>cut19, wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)0, 63>>cut19, wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)0, 63>>cut19, wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)0, 63>>cut19, wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)0, 63>>cut19, wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)0, 63>>cut19, wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)0, 63>>cut19, wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)0, 63>>cut19, wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)0, 63>>cut19, wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)0, 63>>cut19, wt510);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt510);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt510);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt510);
}
}
}
} else {
ptrdiff_t k141 = 1008;
ptrdiff_t l60 = (size_t)(0+k141)/6;
ptrdiff_t cut18 = (size_t)(0+k141)%6;
__m512 sum390 = _mm512_maskz_loadu_ps(65535, biasPtr14+4096*i53+4*k141);
__m512i pmMul30 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd30 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo25 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k141+1024*i53));
__m512 masHi25 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k141+1024*i53)+(ptrdiff_t)64);
__m512 postMul44 = _mm512_permutex2var_ps(masLo25, pmMul30, masHi25);
__m512 postAdd28 = _mm512_permutex2var_ps(masLo25, pmAdd30, masHi25);
sum390 = _mm512_fmadd_ps(sum390, postMul44, postAdd28);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*0+(ptrdiff_t)0, 63>>cut18, sum390);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*0+(ptrdiff_t)12288, 4032>>cut18, sum390);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*0+(ptrdiff_t)24576, 65535-(4095>>cut18), sum390);
ptrdiff_t c40 = 0;
for (; c40 != 32; ++c40) {
__m512 wt463 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)0);
__m512 wt464 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)2048);
__m512 wt465 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)4096);
__m512 wt466 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)6144);
__m512 wt467 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)8192);
__m512 wt468 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)10240);
__m512 wt469 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)12288);
__m512 wt470 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)14336);
__m512 wt471 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)16384);
__m512 wt472 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)18432);
__m512 wt473 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)20480);
__m512 wt474 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)22528);
__m512 wt475 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)24576);
__m512 wt476 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)26624);
__m512 wt477 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)28672);
__m512 wt478 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)30720);
__m512 tmp13795 = _mm512_unpacklo_ps(wt463, wt464);
__m512 tmp13796 = _mm512_unpackhi_ps(wt463, wt464);
__m512 tmp13797 = _mm512_unpacklo_ps(wt465, wt466);
__m512 tmp13798 = _mm512_unpackhi_ps(wt465, wt466);
__m512 tmp13799 = _mm512_unpacklo_ps(wt467, wt468);
__m512 tmp13800 = _mm512_unpackhi_ps(wt467, wt468);
__m512 tmp13801 = _mm512_unpacklo_ps(wt469, wt470);
__m512 tmp13802 = _mm512_unpackhi_ps(wt469, wt470);
__m512 tmp13803 = _mm512_unpacklo_ps(wt471, wt472);
__m512 tmp13804 = _mm512_unpackhi_ps(wt471, wt472);
__m512 tmp13805 = _mm512_unpacklo_ps(wt473, wt474);
__m512 tmp13806 = _mm512_unpackhi_ps(wt473, wt474);
__m512 tmp13807 = _mm512_unpacklo_ps(wt475, wt476);
__m512 tmp13808 = _mm512_unpackhi_ps(wt475, wt476);
__m512 tmp13809 = _mm512_unpacklo_ps(wt477, wt478);
__m512 tmp13810 = _mm512_unpackhi_ps(wt477, wt478);
__m512 tmp13811 = _mm512_shuffle_ps(tmp13795, tmp13797, 68);
__m512 tmp13812 = _mm512_shuffle_ps(tmp13795, tmp13797, 238);
__m512 tmp13813 = _mm512_shuffle_ps(tmp13796, tmp13798, 68);
__m512 tmp13814 = _mm512_shuffle_ps(tmp13796, tmp13798, 238);
__m512 tmp13815 = _mm512_shuffle_ps(tmp13799, tmp13801, 68);
__m512 tmp13816 = _mm512_shuffle_ps(tmp13799, tmp13801, 238);
__m512 tmp13817 = _mm512_shuffle_ps(tmp13800, tmp13802, 68);
__m512 tmp13818 = _mm512_shuffle_ps(tmp13800, tmp13802, 238);
__m512 tmp13819 = _mm512_shuffle_ps(tmp13803, tmp13805, 68);
__m512 tmp13820 = _mm512_shuffle_ps(tmp13803, tmp13805, 238);
__m512 tmp13821 = _mm512_shuffle_ps(tmp13804, tmp13806, 68);
__m512 tmp13822 = _mm512_shuffle_ps(tmp13804, tmp13806, 238);
__m512 tmp13823 = _mm512_shuffle_ps(tmp13807, tmp13809, 68);
__m512 tmp13824 = _mm512_shuffle_ps(tmp13807, tmp13809, 238);
__m512 tmp13825 = _mm512_shuffle_ps(tmp13808, tmp13810, 68);
__m512 tmp13826 = _mm512_shuffle_ps(tmp13808, tmp13810, 238);
__m512 tmp13827 = _mm512_shuffle_f32x4(tmp13811, tmp13815, 136);
__m512 tmp13828 = _mm512_shuffle_f32x4(tmp13811, tmp13815, 221);
__m512 tmp13829 = _mm512_shuffle_f32x4(tmp13812, tmp13816, 136);
__m512 tmp13830 = _mm512_shuffle_f32x4(tmp13812, tmp13816, 221);
__m512 tmp13831 = _mm512_shuffle_f32x4(tmp13813, tmp13817, 136);
__m512 tmp13832 = _mm512_shuffle_f32x4(tmp13813, tmp13817, 221);
__m512 tmp13833 = _mm512_shuffle_f32x4(tmp13814, tmp13818, 136);
__m512 tmp13834 = _mm512_shuffle_f32x4(tmp13814, tmp13818, 221);
__m512 tmp13835 = _mm512_shuffle_f32x4(tmp13819, tmp13823, 136);
__m512 tmp13836 = _mm512_shuffle_f32x4(tmp13819, tmp13823, 221);
__m512 tmp13837 = _mm512_shuffle_f32x4(tmp13820, tmp13824, 136);
__m512 tmp13838 = _mm512_shuffle_f32x4(tmp13820, tmp13824, 221);
__m512 tmp13839 = _mm512_shuffle_f32x4(tmp13821, tmp13825, 136);
__m512 tmp13840 = _mm512_shuffle_f32x4(tmp13821, tmp13825, 221);
__m512 tmp13841 = _mm512_shuffle_f32x4(tmp13822, tmp13826, 136);
__m512 tmp13842 = _mm512_shuffle_f32x4(tmp13822, tmp13826, 221);
wt463 = _mm512_shuffle_f32x4(tmp13827, tmp13835, 136);
wt471 = _mm512_shuffle_f32x4(tmp13827, tmp13835, 221);
wt464 = _mm512_shuffle_f32x4(tmp13829, tmp13837, 136);
wt472 = _mm512_shuffle_f32x4(tmp13829, tmp13837, 221);
wt465 = _mm512_shuffle_f32x4(tmp13831, tmp13839, 136);
wt473 = _mm512_shuffle_f32x4(tmp13831, tmp13839, 221);
wt466 = _mm512_shuffle_f32x4(tmp13833, tmp13841, 136);
wt474 = _mm512_shuffle_f32x4(tmp13833, tmp13841, 221);
wt467 = _mm512_shuffle_f32x4(tmp13828, tmp13836, 136);
wt475 = _mm512_shuffle_f32x4(tmp13828, tmp13836, 221);
wt468 = _mm512_shuffle_f32x4(tmp13830, tmp13838, 136);
wt476 = _mm512_shuffle_f32x4(tmp13830, tmp13838, 221);
wt469 = _mm512_shuffle_f32x4(tmp13832, tmp13840, 136);
wt477 = _mm512_shuffle_f32x4(tmp13832, tmp13840, 221);
wt470 = _mm512_shuffle_f32x4(tmp13834, tmp13842, 136);
wt478 = _mm512_shuffle_f32x4(tmp13834, tmp13842, 221);
wt463 = _mm512_mul_ps(wt463, postMul44);
wt464 = _mm512_mul_ps(wt464, postMul44);
wt465 = _mm512_mul_ps(wt465, postMul44);
wt466 = _mm512_mul_ps(wt466, postMul44);
wt467 = _mm512_mul_ps(wt467, postMul44);
wt468 = _mm512_mul_ps(wt468, postMul44);
wt469 = _mm512_mul_ps(wt469, postMul44);
wt470 = _mm512_mul_ps(wt470, postMul44);
wt471 = _mm512_mul_ps(wt471, postMul44);
wt472 = _mm512_mul_ps(wt472, postMul44);
wt473 = _mm512_mul_ps(wt473, postMul44);
wt474 = _mm512_mul_ps(wt474, postMul44);
wt475 = _mm512_mul_ps(wt475, postMul44);
wt476 = _mm512_mul_ps(wt476, postMul44);
wt477 = _mm512_mul_ps(wt477, postMul44);
wt478 = _mm512_mul_ps(wt478, postMul44);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(1+16*c40)+(ptrdiff_t)0, 63>>cut18, wt463);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(2+16*c40)+(ptrdiff_t)0, 63>>cut18, wt464);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(3+16*c40)+(ptrdiff_t)0, 63>>cut18, wt465);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(4+16*c40)+(ptrdiff_t)0, 63>>cut18, wt466);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(5+16*c40)+(ptrdiff_t)0, 63>>cut18, wt467);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(6+16*c40)+(ptrdiff_t)0, 63>>cut18, wt468);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(7+16*c40)+(ptrdiff_t)0, 63>>cut18, wt469);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(8+16*c40)+(ptrdiff_t)0, 63>>cut18, wt470);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(9+16*c40)+(ptrdiff_t)0, 63>>cut18, wt471);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(10+16*c40)+(ptrdiff_t)0, 63>>cut18, wt472);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(11+16*c40)+(ptrdiff_t)0, 63>>cut18, wt473);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(12+16*c40)+(ptrdiff_t)0, 63>>cut18, wt474);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(13+16*c40)+(ptrdiff_t)0, 63>>cut18, wt475);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(14+16*c40)+(ptrdiff_t)0, 63>>cut18, wt476);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(15+16*c40)+(ptrdiff_t)0, 63>>cut18, wt477);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(16+16*c40)+(ptrdiff_t)0, 63>>cut18, wt478);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(1+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt463);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(2+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt464);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(3+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt465);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(4+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt466);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(5+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt467);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(6+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt468);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(7+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt469);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(8+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt470);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(9+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt471);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(10+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt472);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(11+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt473);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(12+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt474);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(13+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt475);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(14+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt476);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(15+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt477);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(16+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt478);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(1+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt463);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(2+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt464);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(3+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt465);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(4+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt466);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(5+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt467);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(6+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt468);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(7+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt469);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(8+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt470);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(9+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt471);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(10+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt472);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(11+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt473);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(12+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt474);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(13+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt475);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(14+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt476);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(15+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt477);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(16+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt478);
}
}
}
}
}

// Launches ResNeXt50OneArrangeWts8Callee1 over a 3-dimensional hull of
// 64 x 1 x 1 points on the given team.
//
// team56    - threader team passed straight to ResNeXt50ThreaderDo1.
// tensors85 - tensor pointer table, forwarded unchanged to the callee
//             through the task's any1 field.
//
// NOTE(review): presumably ResNeXt50ThreaderDo1 invokes the callee once per
// hull point; its definition is outside this part of the file -- confirm.
static void ResNeXt50OneArrangeWts8(ResNeXt50ThreaderTeam1* team56, char** tensors85) {
    ResNeXt50ThreaderTask1 job;
    // Iteration space: only the first dimension has more than one point.
    job.nd1 = 3;
    job.hull1[0] = 64;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    // Payload and entry point for each point of the hull.
    job.any1 = tensors85;
    job.callee1 = ResNeXt50OneArrangeWts8Callee1;
    ResNeXt50ThreaderDo1(team56, &job);
}

// Worker for ResNeXt50OneArrangeDats8. Reads from tensors[0] and writes to
// tensors[1] (both taken from task90->any1), keeping every second float of
// the selected source rows.
//
// pt50[0] picks a run of 128 consecutive k indices starting at 128*pt50[0];
//         each k step advances the source by 3136 bytes.
// pt50[1] picks the row band:
//           < 3   -> four source rows, 224 bytes apart, starting at row
//                    8*pt50[1] (112 bytes per row); 256 output bytes per k.
//           other -> two source rows, 224 bytes apart, starting at row 24;
//                    128 output bytes per k, written into band slot 3.
//
// Each source row is fetched as two zero-masked vectors (low 15 lanes at
// byte 0, low 11 lanes at byte 64). The even-indexed lanes of that pair are
// gathered into one 16-lane vector; since lanes 12 and 14 of the second
// vector are masked to zero, the last two output lanes are always zero.
//
// NOTE(review): presumably k is a channel index and the rows are 28 floats
// wide (stride-2 subsampling of a 28x28 plane) -- confirm against the graph.
static void ResNeXt50OneArrangeDats8Callee1(ResNeXt50ThreaderTask1* task90, int64_t* pt50) {
    char** tensors = task90->any1;
    const ptrdiff_t kBlock = pt50[0];
    const ptrdiff_t band = pt50[1];
    char*restrict src = tensors[0];
    char*restrict dst = tensors[1];
    // Indices 0..14 select from the first operand, 16..30 from the second.
    const __m512i evenLanes = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const ptrdiff_t kBegin = 128*kBlock;
    const ptrdiff_t kEnd = kBegin+128;

    if (band < 3) {
        // Full band: four rows per k. The size_t round-trip mirrors the
        // generated index arithmetic exactly.
        const ptrdiff_t firstRow = (ptrdiff_t)((size_t)band*8);
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            const char* in = src+112*firstRow+3136*k;
            char* out = dst+131072*band+256*k;
            __m512 picked[4];
            // All loads happen before any store, as in the generated code.
            for (int r = 0; r < 4; ++r) {
                __m512 lo = _mm512_maskz_loadu_ps(32767, in+224*r);
                __m512 hi = _mm512_maskz_loadu_ps(2047, in+224*r+64);
                picked[r] = _mm512_permutex2var_ps(lo, evenLanes, hi);
            }
            for (int r = 0; r < 4; ++r) {
                _mm512_storeu_ps(out+64*r, picked[r]);
            }
        }
        return;
    }

    // Tail band: only two rows remain, starting at row 24; output slot is 3.
    for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
        const char* in = src+112*(ptrdiff_t)24+3136*k;
        char* out = dst+131072*(ptrdiff_t)3+128*k;
        __m512 picked[2];
        for (int r = 0; r < 2; ++r) {
            __m512 lo = _mm512_maskz_loadu_ps(32767, in+224*r);
            __m512 hi = _mm512_maskz_loadu_ps(2047, in+224*r+64);
            picked[r] = _mm512_permutex2var_ps(lo, evenLanes, hi);
        }
        for (int r = 0; r < 2; ++r) {
            _mm512_storeu_ps(out+64*r, picked[r]);
        }
    }
}

// Launches ResNeXt50OneArrangeDats8Callee1 over a 4-dimensional hull of
// 4 x 4 x 1 x 1 points on the given team.
//
// team57    - threader team passed straight to ResNeXt50ThreaderDo1.
// tensors87 - tensor pointer table, forwarded unchanged to the callee
//             through the task's any1 field (the callee reads entry 0 and
//             writes entry 1).
//
// NOTE(review): presumably hull dimensions 0 and 1 become the callee's
// pt50[0] (128-wide k block) and pt50[1] (row band) -- confirm against
// ResNeXt50ThreaderDo1, which is defined outside this part of the file.
static void ResNeXt50OneArrangeDats8(ResNeXt50ThreaderTeam1* team57, char** tensors87) {
    ResNeXt50ThreaderTask1 job;
    // Iteration space: the first two dimensions have four points each.
    job.nd1 = 4;
    job.hull1[0] = 4;
    job.hull1[1] = 4;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    // Payload and entry point for each point of the hull.
    job.any1 = tensors87;
    job.callee1 = ResNeXt50OneArrangeDats8Callee1;
    ResNeXt50ThreaderDo1(team57, &job);
}

// Worker for ResNeXt50OneApply8: computes one tile of the product of the
// pre-arranged weights (tensors90[0]) with the pre-arranged data (tensors90[1])
// and writes it into tensors90[2].
//
// task92->any1 is a pointer array; only element 0 (the tensor pointer table)
// is read here.
// pt51[0] (w63) selects the weight block (12312 bytes apart). Any value other
// than 170 takes the path with 6 accumulator groups; 170 takes the remainder
// path with 4 groups.
// pt51[1] (d18) selects the data block (131072 bytes apart). Values below 3
// use four 64-byte vectors per reduction step; otherwise two vectors are used.
//
// Every accumulator starts from a scalar stored at the head of the weight
// block (presumably a bias term prepared by the weight-arranging routine --
// that routine is not visible here, confirm) and then receives 512 fused
// multiply-adds. Results leave through 14-lane masked stores (mask 16383)
// spaced 56 bytes apart, with successive accumulator groups 832 bytes apart.
static void ResNeXt50OneApply8Callee1(ResNeXt50ThreaderTask1* task92, int64_t* pt51) {
void** pair22 = task92->any1;
char** tensors90 = pair22[0];
// e26 and g29 are fixed at 0, so the base offsets computed from them vanish.
ptrdiff_t e26 = 0;
ptrdiff_t g29 = 0;
ptrdiff_t d18 = pt51[1];
ptrdiff_t w63 = pt51[0];
char*restrict arrangedWts8 = tensors90[0]+3424256*e26+(ptrdiff_t)2101248*1*g29;
char*restrict arrangedDats8 = tensors90[1]+748160*e26+(ptrdiff_t)458752*1*g29;
char*restrict datPtr28 = tensors90[2]+(ptrdiff_t)851968*1*g29;
// ii40 is 1, so the outer loop body executes once with i55 == 0.
ptrdiff_t ii40 = 1;
for (ptrdiff_t i55 = 0; i55 < ii40; ++i55) {
// jj47 equals j47 on entry, so this call handles exactly one data block.
ptrdiff_t j47 = 1*d18;
ptrdiff_t jj47 = j47+0;
if (j47 < 3) {
// h49 = 4*j47: output offset in units of 56 bytes (four 56-byte stores per block).
ptrdiff_t h49 = 0+((size_t)j47-0)/1*4;
// The selector is always 0 and the switch has only a default arm.
switch (((size_t)j47-0)%1) {
default: {
wrap6:;
// kk47 equals k145, so the loop body below runs at most once and then
// returns. When w63 == 170 the loop is skipped and the remainder code
// after it runs instead.
ptrdiff_t k145 = 1*w63;
ptrdiff_t kk47 = k145+0;
for (; k145 != 170; ++k145) {
// With s48 == -1 the offsets 24*s48+24 .. 24*s48+44 address bytes 0..20 of
// the weight block: one initial scalar for each of the 6 accumulator groups.
ptrdiff_t s48 = -1;
__m512 sum393 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)24));
__m512 sum397 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)28));
__m512 sum401 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)32));
__m512 sum405 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)36));
__m512 sum409 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)40));
__m512 sum413 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)44));
// Replicate each initial value across the four data-vector positions.
__m512 sum394 = sum393;
__m512 sum395 = sum393;
__m512 sum396 = sum393;
__m512 sum398 = sum397;
__m512 sum399 = sum397;
__m512 sum400 = sum397;
__m512 sum402 = sum401;
__m512 sum403 = sum401;
__m512 sum404 = sum401;
__m512 sum406 = sum405;
__m512 sum407 = sum405;
__m512 sum408 = sum405;
__m512 sum410 = sum409;
__m512 sum411 = sum409;
__m512 sum412 = sum409;
__m512 sum414 = sum413;
__m512 sum415 = sum413;
__m512 sum416 = sum413;
// 512 reduction steps: load 4 data vectors (256 bytes per step), broadcast
// 6 weights (24 bytes per step), and issue 6x4 fused multiply-adds.
for (s48 = 0; s48 < 512; ++s48) {
__m512 dat2277 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)0);
__m512 dat2278 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)64);
__m512 dat2279 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)128);
__m512 dat2280 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)192);
__m512 wt511 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)24));
sum393 = _mm512_fmadd_ps(wt511, dat2277, sum393);
sum394 = _mm512_fmadd_ps(wt511, dat2278, sum394);
sum395 = _mm512_fmadd_ps(wt511, dat2279, sum395);
sum396 = _mm512_fmadd_ps(wt511, dat2280, sum396);
__m512 wt512 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)28));
sum397 = _mm512_fmadd_ps(wt512, dat2277, sum397);
sum398 = _mm512_fmadd_ps(wt512, dat2278, sum398);
sum399 = _mm512_fmadd_ps(wt512, dat2279, sum399);
sum400 = _mm512_fmadd_ps(wt512, dat2280, sum400);
__m512 wt513 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)32));
sum401 = _mm512_fmadd_ps(wt513, dat2277, sum401);
sum402 = _mm512_fmadd_ps(wt513, dat2278, sum402);
sum403 = _mm512_fmadd_ps(wt513, dat2279, sum403);
sum404 = _mm512_fmadd_ps(wt513, dat2280, sum404);
__m512 wt514 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)36));
sum405 = _mm512_fmadd_ps(wt514, dat2277, sum405);
sum406 = _mm512_fmadd_ps(wt514, dat2278, sum406);
sum407 = _mm512_fmadd_ps(wt514, dat2279, sum407);
sum408 = _mm512_fmadd_ps(wt514, dat2280, sum408);
__m512 wt515 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)40));
sum409 = _mm512_fmadd_ps(wt515, dat2277, sum409);
sum410 = _mm512_fmadd_ps(wt515, dat2278, sum410);
sum411 = _mm512_fmadd_ps(wt515, dat2279, sum411);
sum412 = _mm512_fmadd_ps(wt515, dat2280, sum412);
__m512 wt516 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)44));
sum413 = _mm512_fmadd_ps(wt516, dat2277, sum413);
sum414 = _mm512_fmadd_ps(wt516, dat2278, sum414);
sum415 = _mm512_fmadd_ps(wt516, dat2279, sum415);
sum416 = _mm512_fmadd_ps(wt516, dat2280, sum416);
}
// Write the 6x4 tile. Mask 16383 keeps the low 14 lanes (56 bytes), so the
// four stores of a group are back to back; groups are 832 bytes apart.
__m512 dat2281 = sum393;
__m512 dat2282 = sum394;
__m512 dat2283 = sum395;
__m512 dat2284 = sum396;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)0, 16383, dat2281);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)56, 16383, dat2282);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)112, 16383, dat2283);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)168, 16383, dat2284);
__m512 dat2285 = sum397;
__m512 dat2286 = sum398;
__m512 dat2287 = sum399;
__m512 dat2288 = sum400;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)832, 16383, dat2285);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)888, 16383, dat2286);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)944, 16383, dat2287);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1000, 16383, dat2288);
__m512 dat2289 = sum401;
__m512 dat2290 = sum402;
__m512 dat2291 = sum403;
__m512 dat2292 = sum404;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1664, 16383, dat2289);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1720, 16383, dat2290);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1776, 16383, dat2291);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1832, 16383, dat2292);
__m512 dat2293 = sum405;
__m512 dat2294 = sum406;
__m512 dat2295 = sum407;
__m512 dat2296 = sum408;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2496, 16383, dat2293);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2552, 16383, dat2294);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2608, 16383, dat2295);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2664, 16383, dat2296);
__m512 dat2297 = sum409;
__m512 dat2298 = sum410;
__m512 dat2299 = sum411;
__m512 dat2300 = sum412;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3328, 16383, dat2297);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3384, 16383, dat2298);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3440, 16383, dat2299);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3496, 16383, dat2300);
__m512 dat2301 = sum413;
__m512 dat2302 = sum414;
__m512 dat2303 = sum415;
__m512 dat2304 = sum416;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4160, 16383, dat2301);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4216, 16383, dat2302);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4272, 16383, dat2303);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4328, 16383, dat2304);
// k145 still equals kk47 here, so the tile is done after one weight block.
if (k145 >= kk47) return;
}
// Remainder weight block (reached only when the loop above was skipped,
// i.e. w63 == 170): 4 accumulator groups and a 16-byte weight step.
// With s49 == -1 the offsets below address bytes 0..12 of the block.
ptrdiff_t s49 = -1;
__m512 sum417 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)16));
__m512 sum421 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)20));
__m512 sum425 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)24));
__m512 sum429 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)28));
__m512 sum418 = sum417;
__m512 sum419 = sum417;
__m512 sum420 = sum417;
__m512 sum422 = sum421;
__m512 sum423 = sum421;
__m512 sum424 = sum421;
__m512 sum426 = sum425;
__m512 sum427 = sum425;
__m512 sum428 = sum425;
__m512 sum430 = sum429;
__m512 sum431 = sum429;
__m512 sum432 = sum429;
// Same reduction as above with 4 weights per step: 4x4 fused multiply-adds.
for (s49 = 0; s49 < 512; ++s49) {
__m512 dat2305 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)0);
__m512 dat2306 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)64);
__m512 dat2307 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)128);
__m512 dat2308 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)192);
__m512 wt517 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)16));
sum417 = _mm512_fmadd_ps(wt517, dat2305, sum417);
sum418 = _mm512_fmadd_ps(wt517, dat2306, sum418);
sum419 = _mm512_fmadd_ps(wt517, dat2307, sum419);
sum420 = _mm512_fmadd_ps(wt517, dat2308, sum420);
__m512 wt518 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)20));
sum421 = _mm512_fmadd_ps(wt518, dat2305, sum421);
sum422 = _mm512_fmadd_ps(wt518, dat2306, sum422);
sum423 = _mm512_fmadd_ps(wt518, dat2307, sum423);
sum424 = _mm512_fmadd_ps(wt518, dat2308, sum424);
__m512 wt519 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)24));
sum425 = _mm512_fmadd_ps(wt519, dat2305, sum425);
sum426 = _mm512_fmadd_ps(wt519, dat2306, sum426);
sum427 = _mm512_fmadd_ps(wt519, dat2307, sum427);
sum428 = _mm512_fmadd_ps(wt519, dat2308, sum428);
__m512 wt520 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)28));
sum429 = _mm512_fmadd_ps(wt520, dat2305, sum429);
sum430 = _mm512_fmadd_ps(wt520, dat2306, sum430);
sum431 = _mm512_fmadd_ps(wt520, dat2307, sum431);
sum432 = _mm512_fmadd_ps(wt520, dat2308, sum432);
}
// Write the 4x4 remainder tile with the same 56-byte / 832-byte spacing.
__m512 dat2309 = sum417;
__m512 dat2310 = sum418;
__m512 dat2311 = sum419;
__m512 dat2312 = sum420;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)0, 16383, dat2309);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)56, 16383, dat2310);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)112, 16383, dat2311);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)168, 16383, dat2312);
__m512 dat2313 = sum421;
__m512 dat2314 = sum422;
__m512 dat2315 = sum423;
__m512 dat2316 = sum424;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)832, 16383, dat2313);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)888, 16383, dat2314);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)944, 16383, dat2315);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1000, 16383, dat2316);
__m512 dat2317 = sum425;
__m512 dat2318 = sum426;
__m512 dat2319 = sum427;
__m512 dat2320 = sum428;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1664, 16383, dat2317);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1720, 16383, dat2318);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1776, 16383, dat2319);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1832, 16383, dat2320);
__m512 dat2321 = sum429;
__m512 dat2322 = sum430;
__m512 dat2323 = sum431;
__m512 dat2324 = sum432;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2496, 16383, dat2321);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2552, 16383, dat2322);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2608, 16383, dat2323);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2664, 16383, dat2324);
// j47 has not changed since jj47 was derived from it, so this returns;
// the advance-and-wrap statements after it are not reached from here.
if (j47 >= jj47) return;
if (j47 >= 2) break;
++j47;
h49 += 4;
goto wrap6;
}
}
j47 = 3;
}
// Final data block: output offset fixed at 12 units of 56 bytes, and only two
// data vectors per reduction step (128-byte data stride instead of 256).
ptrdiff_t h50 = 12;
// Only a default arm, so this always executes.
switch (j47) {
default: {
j47 = 3;
// As in the first half: one weight block per call, or the remainder block
// after the loop when w63 == 170.
ptrdiff_t k146 = 1*w63;
ptrdiff_t kk48 = k146+0;
for (; k146 != 170; ++k146) {
// s50 == -1 addresses the 6 initial scalars at the head of the weight block.
ptrdiff_t s50 = -1;
__m512 sum433 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)24));
__m512 sum435 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)28));
__m512 sum437 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)32));
__m512 sum439 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)36));
__m512 sum441 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)40));
__m512 sum443 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)44));
__m512 sum434 = sum433;
__m512 sum436 = sum435;
__m512 sum438 = sum437;
__m512 sum440 = sum439;
__m512 sum442 = sum441;
__m512 sum444 = sum443;
// 512 reduction steps: 2 data vectors, 6 broadcast weights, 6x2 FMAs.
for (s50 = 0; s50 < 512; ++s50) {
__m512 dat2325 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s50+(ptrdiff_t)0);
__m512 dat2326 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s50+(ptrdiff_t)64);
__m512 wt521 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)24));
sum433 = _mm512_fmadd_ps(wt521, dat2325, sum433);
sum434 = _mm512_fmadd_ps(wt521, dat2326, sum434);
__m512 wt522 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)28));
sum435 = _mm512_fmadd_ps(wt522, dat2325, sum435);
sum436 = _mm512_fmadd_ps(wt522, dat2326, sum436);
__m512 wt523 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)32));
sum437 = _mm512_fmadd_ps(wt523, dat2325, sum437);
sum438 = _mm512_fmadd_ps(wt523, dat2326, sum438);
__m512 wt524 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)36));
sum439 = _mm512_fmadd_ps(wt524, dat2325, sum439);
sum440 = _mm512_fmadd_ps(wt524, dat2326, sum440);
__m512 wt525 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)40));
sum441 = _mm512_fmadd_ps(wt525, dat2325, sum441);
sum442 = _mm512_fmadd_ps(wt525, dat2326, sum442);
__m512 wt526 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)44));
sum443 = _mm512_fmadd_ps(wt526, dat2325, sum443);
sum444 = _mm512_fmadd_ps(wt526, dat2326, sum444);
}
// Write the 6x2 tile: two 14-lane stores per group, groups 832 bytes apart.
__m512 dat2327 = sum433;
__m512 dat2328 = sum434;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)0, 16383, dat2327);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)56, 16383, dat2328);
__m512 dat2329 = sum435;
__m512 dat2330 = sum436;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)832, 16383, dat2329);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)888, 16383, dat2330);
__m512 dat2331 = sum437;
__m512 dat2332 = sum438;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1664, 16383, dat2331);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1720, 16383, dat2332);
__m512 dat2333 = sum439;
__m512 dat2334 = sum440;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2496, 16383, dat2333);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2552, 16383, dat2334);
__m512 dat2335 = sum441;
__m512 dat2336 = sum442;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)3328, 16383, dat2335);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)3384, 16383, dat2336);
__m512 dat2337 = sum443;
__m512 dat2338 = sum444;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)4160, 16383, dat2337);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)4216, 16383, dat2338);
// k146 still equals kk48, so return after this single weight block.
if (k146 >= kk48) return;
}
// Remainder weight block (w63 == 170) for the final data block:
// 4 accumulator groups, 2 data vectors, 16-byte weight step.
ptrdiff_t s51 = -1;
__m512 sum445 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)16));
__m512 sum447 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)20));
__m512 sum449 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)24));
__m512 sum451 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)28));
__m512 sum446 = sum445;
__m512 sum448 = sum447;
__m512 sum450 = sum449;
__m512 sum452 = sum451;
for (s51 = 0; s51 < 512; ++s51) {
__m512 dat2339 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s51+(ptrdiff_t)0);
__m512 dat2340 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s51+(ptrdiff_t)64);
__m512 wt527 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)16));
sum445 = _mm512_fmadd_ps(wt527, dat2339, sum445);
sum446 = _mm512_fmadd_ps(wt527, dat2340, sum446);
__m512 wt528 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)20));
sum447 = _mm512_fmadd_ps(wt528, dat2339, sum447);
sum448 = _mm512_fmadd_ps(wt528, dat2340, sum448);
__m512 wt529 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)24));
sum449 = _mm512_fmadd_ps(wt529, dat2339, sum449);
sum450 = _mm512_fmadd_ps(wt529, dat2340, sum450);
__m512 wt530 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)28));
sum451 = _mm512_fmadd_ps(wt530, dat2339, sum451);
sum452 = _mm512_fmadd_ps(wt530, dat2340, sum452);
}
// Write the 4x2 remainder tile.
__m512 dat2341 = sum445;
__m512 dat2342 = sum446;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)0, 16383, dat2341);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)56, 16383, dat2342);
__m512 dat2343 = sum447;
__m512 dat2344 = sum448;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)832, 16383, dat2343);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)888, 16383, dat2344);
__m512 dat2345 = sum449;
__m512 dat2346 = sum450;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1664, 16383, dat2345);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1720, 16383, dat2346);
__m512 dat2347 = sum451;
__m512 dat2348 = sum452;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2496, 16383, dat2347);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2552, 16383, dat2348);
if (j47 >= jj47) return;
}
}
j47 = 4;
}
}

// Runs ResNeXt50OneApply8Callee1 over a 3-dimensional 171 x 4 x 1 index space
// on the given thread team. The payload is a two-slot pointer array: slot 0
// carries the tensor pointer table, slot 1 is null.
static void ResNeXt50OneApply8(ResNeXt50ThreaderTeam1* team58, char** tensors89) {
	static const int extents[] = {171, 4, 1};
	enum { rank = sizeof(extents)/sizeof(extents[0]) };
	void* payload[2];
	payload[0] = tensors89;
	payload[1] = 0;
	ResNeXt50ThreaderTask1 job;
	// Extent of each dimension of the index space.
	for (int axis = 0; axis < rank; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	job.nd1 = rank;
	job.any1 = payload;
	job.callee1 = ResNeXt50OneApply8Callee1;
	ResNeXt50ThreaderDo1(team58, &job);
}

static void ResNeXt50OneArrangeWts9Callee1(ResNeXt50ThreaderTask1* task94, int64_t* pt52) {
char** tensors92 = task94->any1;
ptrdiff_t b75 = pt52[0];
char*restrict wtPtr15 = tensors92[0]+(ptrdiff_t)3340*0+(ptrdiff_t)1048576*0;
char*restrict biasPtr15 = tensors92[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr16 = tensors92[2]+(ptrdiff_t)8*512*0;
char*restrict arranged17 = tensors92[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)1050624*0;
ptrdiff_t ii41 = 1;
for (ptrdiff_t i56 = 0; i56 < ii41; ++i56) {
ptrdiff_t j48 = 1*b75;
ptrdiff_t jj48 = j48+1;
for (; j48 < jj48; ++j48) {
if (j48 < 31) {
ptrdiff_t k148 = 0+16*(j48-0);
ptrdiff_t l63 = (size_t)(0+k148)/6;
ptrdiff_t cut21 = (size_t)(0+k148)%6;
switch (cut21) {
case 0:;
case 2: {
__m512 sum454 = _mm512_maskz_loadu_ps(65535, biasPtr15+2048*i56+4*k148);
__m512i pmMul31 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd31 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo26 = _mm512_loadu_ps(bnPtr16+(ptrdiff_t)8*(k148+512*i56));
__m512 masHi26 = _mm512_maskz_loadu_ps(65535, bnPtr16+(ptrdiff_t)8*(k148+512*i56)+(ptrdiff_t)64);
__m512 postMul48 = _mm512_permutex2var_ps(masLo26, pmMul31, masHi26);
__m512 postAdd32 = _mm512_permutex2var_ps(masLo26, pmAdd31, masHi26);
sum454 = _mm512_fmadd_ps(sum454, postMul48, postAdd32);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum454);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum454);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)24576, 65535-(4095>>cut21), sum454);
ptrdiff_t c45 = 0;
for (; c45 != 32; ++c45) {
__m512 wt547 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)0);
__m512 wt548 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)2048);
__m512 wt549 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)4096);
__m512 wt550 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)6144);
__m512 wt551 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)8192);
__m512 wt552 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)10240);
__m512 wt553 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)12288);
__m512 wt554 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)14336);
__m512 wt555 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)16384);
__m512 wt556 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)18432);
__m512 wt557 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)20480);
__m512 wt558 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)22528);
__m512 wt559 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)24576);
__m512 wt560 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)26624);
__m512 wt561 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)28672);
__m512 wt562 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)30720);
__m512 tmp13843 = _mm512_unpacklo_ps(wt547, wt548);
__m512 tmp13844 = _mm512_unpackhi_ps(wt547, wt548);
__m512 tmp13845 = _mm512_unpacklo_ps(wt549, wt550);
__m512 tmp13846 = _mm512_unpackhi_ps(wt549, wt550);
__m512 tmp13847 = _mm512_unpacklo_ps(wt551, wt552);
__m512 tmp13848 = _mm512_unpackhi_ps(wt551, wt552);
__m512 tmp13849 = _mm512_unpacklo_ps(wt553, wt554);
__m512 tmp13850 = _mm512_unpackhi_ps(wt553, wt554);
__m512 tmp13851 = _mm512_unpacklo_ps(wt555, wt556);
__m512 tmp13852 = _mm512_unpackhi_ps(wt555, wt556);
__m512 tmp13853 = _mm512_unpacklo_ps(wt557, wt558);
__m512 tmp13854 = _mm512_unpackhi_ps(wt557, wt558);
__m512 tmp13855 = _mm512_unpacklo_ps(wt559, wt560);
__m512 tmp13856 = _mm512_unpackhi_ps(wt559, wt560);
__m512 tmp13857 = _mm512_unpacklo_ps(wt561, wt562);
__m512 tmp13858 = _mm512_unpackhi_ps(wt561, wt562);
__m512 tmp13859 = _mm512_shuffle_ps(tmp13843, tmp13845, 68);
__m512 tmp13860 = _mm512_shuffle_ps(tmp13843, tmp13845, 238);
__m512 tmp13861 = _mm512_shuffle_ps(tmp13844, tmp13846, 68);
__m512 tmp13862 = _mm512_shuffle_ps(tmp13844, tmp13846, 238);
__m512 tmp13863 = _mm512_shuffle_ps(tmp13847, tmp13849, 68);
__m512 tmp13864 = _mm512_shuffle_ps(tmp13847, tmp13849, 238);
__m512 tmp13865 = _mm512_shuffle_ps(tmp13848, tmp13850, 68);
__m512 tmp13866 = _mm512_shuffle_ps(tmp13848, tmp13850, 238);
__m512 tmp13867 = _mm512_shuffle_ps(tmp13851, tmp13853, 68);
__m512 tmp13868 = _mm512_shuffle_ps(tmp13851, tmp13853, 238);
__m512 tmp13869 = _mm512_shuffle_ps(tmp13852, tmp13854, 68);
__m512 tmp13870 = _mm512_shuffle_ps(tmp13852, tmp13854, 238);
__m512 tmp13871 = _mm512_shuffle_ps(tmp13855, tmp13857, 68);
__m512 tmp13872 = _mm512_shuffle_ps(tmp13855, tmp13857, 238);
__m512 tmp13873 = _mm512_shuffle_ps(tmp13856, tmp13858, 68);
__m512 tmp13874 = _mm512_shuffle_ps(tmp13856, tmp13858, 238);
__m512 tmp13875 = _mm512_shuffle_f32x4(tmp13859, tmp13863, 136);
__m512 tmp13876 = _mm512_shuffle_f32x4(tmp13859, tmp13863, 221);
__m512 tmp13877 = _mm512_shuffle_f32x4(tmp13860, tmp13864, 136);
__m512 tmp13878 = _mm512_shuffle_f32x4(tmp13860, tmp13864, 221);
__m512 tmp13879 = _mm512_shuffle_f32x4(tmp13861, tmp13865, 136);
__m512 tmp13880 = _mm512_shuffle_f32x4(tmp13861, tmp13865, 221);
__m512 tmp13881 = _mm512_shuffle_f32x4(tmp13862, tmp13866, 136);
__m512 tmp13882 = _mm512_shuffle_f32x4(tmp13862, tmp13866, 221);
__m512 tmp13883 = _mm512_shuffle_f32x4(tmp13867, tmp13871, 136);
__m512 tmp13884 = _mm512_shuffle_f32x4(tmp13867, tmp13871, 221);
__m512 tmp13885 = _mm512_shuffle_f32x4(tmp13868, tmp13872, 136);
__m512 tmp13886 = _mm512_shuffle_f32x4(tmp13868, tmp13872, 221);
__m512 tmp13887 = _mm512_shuffle_f32x4(tmp13869, tmp13873, 136);
__m512 tmp13888 = _mm512_shuffle_f32x4(tmp13869, tmp13873, 221);
__m512 tmp13889 = _mm512_shuffle_f32x4(tmp13870, tmp13874, 136);
__m512 tmp13890 = _mm512_shuffle_f32x4(tmp13870, tmp13874, 221);
wt547 = _mm512_shuffle_f32x4(tmp13875, tmp13883, 136);
wt555 = _mm512_shuffle_f32x4(tmp13875, tmp13883, 221);
wt548 = _mm512_shuffle_f32x4(tmp13877, tmp13885, 136);
wt556 = _mm512_shuffle_f32x4(tmp13877, tmp13885, 221);
wt549 = _mm512_shuffle_f32x4(tmp13879, tmp13887, 136);
wt557 = _mm512_shuffle_f32x4(tmp13879, tmp13887, 221);
wt550 = _mm512_shuffle_f32x4(tmp13881, tmp13889, 136);
wt558 = _mm512_shuffle_f32x4(tmp13881, tmp13889, 221);
wt551 = _mm512_shuffle_f32x4(tmp13876, tmp13884, 136);
wt559 = _mm512_shuffle_f32x4(tmp13876, tmp13884, 221);
wt552 = _mm512_shuffle_f32x4(tmp13878, tmp13886, 136);
wt560 = _mm512_shuffle_f32x4(tmp13878, tmp13886, 221);
wt553 = _mm512_shuffle_f32x4(tmp13880, tmp13888, 136);
wt561 = _mm512_shuffle_f32x4(tmp13880, tmp13888, 221);
wt554 = _mm512_shuffle_f32x4(tmp13882, tmp13890, 136);
wt562 = _mm512_shuffle_f32x4(tmp13882, tmp13890, 221);
wt547 = _mm512_mul_ps(wt547, postMul48);
wt548 = _mm512_mul_ps(wt548, postMul48);
wt549 = _mm512_mul_ps(wt549, postMul48);
wt550 = _mm512_mul_ps(wt550, postMul48);
wt551 = _mm512_mul_ps(wt551, postMul48);
wt552 = _mm512_mul_ps(wt552, postMul48);
wt553 = _mm512_mul_ps(wt553, postMul48);
wt554 = _mm512_mul_ps(wt554, postMul48);
wt555 = _mm512_mul_ps(wt555, postMul48);
wt556 = _mm512_mul_ps(wt556, postMul48);
wt557 = _mm512_mul_ps(wt557, postMul48);
wt558 = _mm512_mul_ps(wt558, postMul48);
wt559 = _mm512_mul_ps(wt559, postMul48);
wt560 = _mm512_mul_ps(wt560, postMul48);
wt561 = _mm512_mul_ps(wt561, postMul48);
wt562 = _mm512_mul_ps(wt562, postMul48);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c45)+(ptrdiff_t)0, 63>>cut21, wt547);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c45)+(ptrdiff_t)0, 63>>cut21, wt548);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c45)+(ptrdiff_t)0, 63>>cut21, wt549);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c45)+(ptrdiff_t)0, 63>>cut21, wt550);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c45)+(ptrdiff_t)0, 63>>cut21, wt551);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c45)+(ptrdiff_t)0, 63>>cut21, wt552);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c45)+(ptrdiff_t)0, 63>>cut21, wt553);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c45)+(ptrdiff_t)0, 63>>cut21, wt554);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c45)+(ptrdiff_t)0, 63>>cut21, wt555);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c45)+(ptrdiff_t)0, 63>>cut21, wt556);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c45)+(ptrdiff_t)0, 63>>cut21, wt557);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c45)+(ptrdiff_t)0, 63>>cut21, wt558);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c45)+(ptrdiff_t)0, 63>>cut21, wt559);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c45)+(ptrdiff_t)0, 63>>cut21, wt560);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c45)+(ptrdiff_t)0, 63>>cut21, wt561);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c45)+(ptrdiff_t)0, 63>>cut21, wt562);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt547);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt548);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt549);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt550);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt551);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt552);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt553);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt554);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt555);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt556);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt557);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt558);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt559);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt560);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt561);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt562);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt547);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt548);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt549);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt550);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt551);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt552);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt553);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt554);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt555);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt556);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt557);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt558);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt559);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt560);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt561);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt562);
}
break;
}
default: {
cut21 = 4;
__m512 sum455 = _mm512_maskz_loadu_ps(65535, biasPtr15+2048*i56+4*k148);
__m512i pmMul32 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd32 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo27 = _mm512_loadu_ps(bnPtr16+(ptrdiff_t)8*(k148+512*i56));
__m512 masHi27 = _mm512_maskz_loadu_ps(65535, bnPtr16+(ptrdiff_t)8*(k148+512*i56)+(ptrdiff_t)64);
__m512 postMul49 = _mm512_permutex2var_ps(masLo27, pmMul32, masHi27);
__m512 postAdd33 = _mm512_permutex2var_ps(masLo27, pmAdd32, masHi27);
sum455 = _mm512_fmadd_ps(sum455, postMul49, postAdd33);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum455);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum455);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)24576, 258048>>cut21, sum455);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)36864, 65535-(262143>>cut21), sum455);
ptrdiff_t c46 = 0;
for (; c46 != 32; ++c46) {
__m512 wt563 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)0);
__m512 wt564 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)2048);
__m512 wt565 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)4096);
__m512 wt566 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)6144);
__m512 wt567 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)8192);
__m512 wt568 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)10240);
__m512 wt569 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)12288);
__m512 wt570 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)14336);
__m512 wt571 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)16384);
__m512 wt572 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)18432);
__m512 wt573 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)20480);
__m512 wt574 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)22528);
__m512 wt575 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)24576);
__m512 wt576 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)26624);
__m512 wt577 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)28672);
__m512 wt578 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)30720);
__m512 tmp13891 = _mm512_unpacklo_ps(wt563, wt564);
__m512 tmp13892 = _mm512_unpackhi_ps(wt563, wt564);
__m512 tmp13893 = _mm512_unpacklo_ps(wt565, wt566);
__m512 tmp13894 = _mm512_unpackhi_ps(wt565, wt566);
__m512 tmp13895 = _mm512_unpacklo_ps(wt567, wt568);
__m512 tmp13896 = _mm512_unpackhi_ps(wt567, wt568);
__m512 tmp13897 = _mm512_unpacklo_ps(wt569, wt570);
__m512 tmp13898 = _mm512_unpackhi_ps(wt569, wt570);
__m512 tmp13899 = _mm512_unpacklo_ps(wt571, wt572);
__m512 tmp13900 = _mm512_unpackhi_ps(wt571, wt572);
__m512 tmp13901 = _mm512_unpacklo_ps(wt573, wt574);
__m512 tmp13902 = _mm512_unpackhi_ps(wt573, wt574);
__m512 tmp13903 = _mm512_unpacklo_ps(wt575, wt576);
__m512 tmp13904 = _mm512_unpackhi_ps(wt575, wt576);
__m512 tmp13905 = _mm512_unpacklo_ps(wt577, wt578);
__m512 tmp13906 = _mm512_unpackhi_ps(wt577, wt578);
__m512 tmp13907 = _mm512_shuffle_ps(tmp13891, tmp13893, 68);
__m512 tmp13908 = _mm512_shuffle_ps(tmp13891, tmp13893, 238);
__m512 tmp13909 = _mm512_shuffle_ps(tmp13892, tmp13894, 68);
__m512 tmp13910 = _mm512_shuffle_ps(tmp13892, tmp13894, 238);
__m512 tmp13911 = _mm512_shuffle_ps(tmp13895, tmp13897, 68);
__m512 tmp13912 = _mm512_shuffle_ps(tmp13895, tmp13897, 238);
__m512 tmp13913 = _mm512_shuffle_ps(tmp13896, tmp13898, 68);
__m512 tmp13914 = _mm512_shuffle_ps(tmp13896, tmp13898, 238);
__m512 tmp13915 = _mm512_shuffle_ps(tmp13899, tmp13901, 68);
__m512 tmp13916 = _mm512_shuffle_ps(tmp13899, tmp13901, 238);
__m512 tmp13917 = _mm512_shuffle_ps(tmp13900, tmp13902, 68);
__m512 tmp13918 = _mm512_shuffle_ps(tmp13900, tmp13902, 238);
__m512 tmp13919 = _mm512_shuffle_ps(tmp13903, tmp13905, 68);
__m512 tmp13920 = _mm512_shuffle_ps(tmp13903, tmp13905, 238);
__m512 tmp13921 = _mm512_shuffle_ps(tmp13904, tmp13906, 68);
__m512 tmp13922 = _mm512_shuffle_ps(tmp13904, tmp13906, 238);
__m512 tmp13923 = _mm512_shuffle_f32x4(tmp13907, tmp13911, 136);
__m512 tmp13924 = _mm512_shuffle_f32x4(tmp13907, tmp13911, 221);
__m512 tmp13925 = _mm512_shuffle_f32x4(tmp13908, tmp13912, 136);
__m512 tmp13926 = _mm512_shuffle_f32x4(tmp13908, tmp13912, 221);
__m512 tmp13927 = _mm512_shuffle_f32x4(tmp13909, tmp13913, 136);
__m512 tmp13928 = _mm512_shuffle_f32x4(tmp13909, tmp13913, 221);
__m512 tmp13929 = _mm512_shuffle_f32x4(tmp13910, tmp13914, 136);
__m512 tmp13930 = _mm512_shuffle_f32x4(tmp13910, tmp13914, 221);
__m512 tmp13931 = _mm512_shuffle_f32x4(tmp13915, tmp13919, 136);
__m512 tmp13932 = _mm512_shuffle_f32x4(tmp13915, tmp13919, 221);
__m512 tmp13933 = _mm512_shuffle_f32x4(tmp13916, tmp13920, 136);
__m512 tmp13934 = _mm512_shuffle_f32x4(tmp13916, tmp13920, 221);
__m512 tmp13935 = _mm512_shuffle_f32x4(tmp13917, tmp13921, 136);
__m512 tmp13936 = _mm512_shuffle_f32x4(tmp13917, tmp13921, 221);
__m512 tmp13937 = _mm512_shuffle_f32x4(tmp13918, tmp13922, 136);
__m512 tmp13938 = _mm512_shuffle_f32x4(tmp13918, tmp13922, 221);
wt563 = _mm512_shuffle_f32x4(tmp13923, tmp13931, 136);
wt571 = _mm512_shuffle_f32x4(tmp13923, tmp13931, 221);
wt564 = _mm512_shuffle_f32x4(tmp13925, tmp13933, 136);
wt572 = _mm512_shuffle_f32x4(tmp13925, tmp13933, 221);
wt565 = _mm512_shuffle_f32x4(tmp13927, tmp13935, 136);
wt573 = _mm512_shuffle_f32x4(tmp13927, tmp13935, 221);
wt566 = _mm512_shuffle_f32x4(tmp13929, tmp13937, 136);
wt574 = _mm512_shuffle_f32x4(tmp13929, tmp13937, 221);
wt567 = _mm512_shuffle_f32x4(tmp13924, tmp13932, 136);
wt575 = _mm512_shuffle_f32x4(tmp13924, tmp13932, 221);
wt568 = _mm512_shuffle_f32x4(tmp13926, tmp13934, 136);
wt576 = _mm512_shuffle_f32x4(tmp13926, tmp13934, 221);
wt569 = _mm512_shuffle_f32x4(tmp13928, tmp13936, 136);
wt577 = _mm512_shuffle_f32x4(tmp13928, tmp13936, 221);
wt570 = _mm512_shuffle_f32x4(tmp13930, tmp13938, 136);
wt578 = _mm512_shuffle_f32x4(tmp13930, tmp13938, 221);
wt563 = _mm512_mul_ps(wt563, postMul49);
wt564 = _mm512_mul_ps(wt564, postMul49);
wt565 = _mm512_mul_ps(wt565, postMul49);
wt566 = _mm512_mul_ps(wt566, postMul49);
wt567 = _mm512_mul_ps(wt567, postMul49);
wt568 = _mm512_mul_ps(wt568, postMul49);
wt569 = _mm512_mul_ps(wt569, postMul49);
wt570 = _mm512_mul_ps(wt570, postMul49);
wt571 = _mm512_mul_ps(wt571, postMul49);
wt572 = _mm512_mul_ps(wt572, postMul49);
wt573 = _mm512_mul_ps(wt573, postMul49);
wt574 = _mm512_mul_ps(wt574, postMul49);
wt575 = _mm512_mul_ps(wt575, postMul49);
wt576 = _mm512_mul_ps(wt576, postMul49);
wt577 = _mm512_mul_ps(wt577, postMul49);
wt578 = _mm512_mul_ps(wt578, postMul49);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)0, 63>>cut21, wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)0, 63>>cut21, wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)0, 63>>cut21, wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)0, 63>>cut21, wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)0, 63>>cut21, wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)0, 63>>cut21, wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)0, 63>>cut21, wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)0, 63>>cut21, wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)0, 63>>cut21, wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)0, 63>>cut21, wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)0, 63>>cut21, wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)0, 63>>cut21, wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)0, 63>>cut21, wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)0, 63>>cut21, wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)0, 63>>cut21, wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)0, 63>>cut21, wt578);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt578);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt578);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt578);
}
}
}
} else {
ptrdiff_t k147 = 496;
ptrdiff_t l62 = (size_t)(0+k147)/6;
ptrdiff_t cut20 = (size_t)(0+k147)%6;
__m512 sum453 = _mm512_maskz_loadu_ps(65535, biasPtr15+2048*i56+4*k147);
__m512i pmMul33 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd33 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo28 = _mm512_loadu_ps(bnPtr16+(ptrdiff_t)8*(k147+512*i56));
__m512 masHi28 = _mm512_maskz_loadu_ps(65535, bnPtr16+(ptrdiff_t)8*(k147+512*i56)+(ptrdiff_t)64);
__m512 postMul47 = _mm512_permutex2var_ps(masLo28, pmMul33, masHi28);
__m512 postAdd31 = _mm512_permutex2var_ps(masLo28, pmAdd33, masHi28);
sum453 = _mm512_fmadd_ps(sum453, postMul47, postAdd31);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*0+(ptrdiff_t)0, 63>>cut20, sum453);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*0+(ptrdiff_t)12288, 4032>>cut20, sum453);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*0+(ptrdiff_t)24576, 258048>>cut20, sum453);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*0+(ptrdiff_t)36864, 65535-(262143>>cut20), sum453);
ptrdiff_t c44 = 0;
for (; c44 != 32; ++c44) {
__m512 wt531 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)0);
__m512 wt532 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)2048);
__m512 wt533 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)4096);
__m512 wt534 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)6144);
__m512 wt535 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)8192);
__m512 wt536 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)10240);
__m512 wt537 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)12288);
__m512 wt538 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)14336);
__m512 wt539 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)16384);
__m512 wt540 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)18432);
__m512 wt541 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)20480);
__m512 wt542 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)22528);
__m512 wt543 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)24576);
__m512 wt544 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)26624);
__m512 wt545 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)28672);
__m512 wt546 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)30720);
__m512 tmp13939 = _mm512_unpacklo_ps(wt531, wt532);
__m512 tmp13940 = _mm512_unpackhi_ps(wt531, wt532);
__m512 tmp13941 = _mm512_unpacklo_ps(wt533, wt534);
__m512 tmp13942 = _mm512_unpackhi_ps(wt533, wt534);
__m512 tmp13943 = _mm512_unpacklo_ps(wt535, wt536);
__m512 tmp13944 = _mm512_unpackhi_ps(wt535, wt536);
__m512 tmp13945 = _mm512_unpacklo_ps(wt537, wt538);
__m512 tmp13946 = _mm512_unpackhi_ps(wt537, wt538);
__m512 tmp13947 = _mm512_unpacklo_ps(wt539, wt540);
__m512 tmp13948 = _mm512_unpackhi_ps(wt539, wt540);
__m512 tmp13949 = _mm512_unpacklo_ps(wt541, wt542);
__m512 tmp13950 = _mm512_unpackhi_ps(wt541, wt542);
__m512 tmp13951 = _mm512_unpacklo_ps(wt543, wt544);
__m512 tmp13952 = _mm512_unpackhi_ps(wt543, wt544);
__m512 tmp13953 = _mm512_unpacklo_ps(wt545, wt546);
__m512 tmp13954 = _mm512_unpackhi_ps(wt545, wt546);
__m512 tmp13955 = _mm512_shuffle_ps(tmp13939, tmp13941, 68);
__m512 tmp13956 = _mm512_shuffle_ps(tmp13939, tmp13941, 238);
__m512 tmp13957 = _mm512_shuffle_ps(tmp13940, tmp13942, 68);
__m512 tmp13958 = _mm512_shuffle_ps(tmp13940, tmp13942, 238);
__m512 tmp13959 = _mm512_shuffle_ps(tmp13943, tmp13945, 68);
__m512 tmp13960 = _mm512_shuffle_ps(tmp13943, tmp13945, 238);
__m512 tmp13961 = _mm512_shuffle_ps(tmp13944, tmp13946, 68);
__m512 tmp13962 = _mm512_shuffle_ps(tmp13944, tmp13946, 238);
__m512 tmp13963 = _mm512_shuffle_ps(tmp13947, tmp13949, 68);
__m512 tmp13964 = _mm512_shuffle_ps(tmp13947, tmp13949, 238);
__m512 tmp13965 = _mm512_shuffle_ps(tmp13948, tmp13950, 68);
__m512 tmp13966 = _mm512_shuffle_ps(tmp13948, tmp13950, 238);
__m512 tmp13967 = _mm512_shuffle_ps(tmp13951, tmp13953, 68);
__m512 tmp13968 = _mm512_shuffle_ps(tmp13951, tmp13953, 238);
__m512 tmp13969 = _mm512_shuffle_ps(tmp13952, tmp13954, 68);
__m512 tmp13970 = _mm512_shuffle_ps(tmp13952, tmp13954, 238);
__m512 tmp13971 = _mm512_shuffle_f32x4(tmp13955, tmp13959, 136);
__m512 tmp13972 = _mm512_shuffle_f32x4(tmp13955, tmp13959, 221);
__m512 tmp13973 = _mm512_shuffle_f32x4(tmp13956, tmp13960, 136);
__m512 tmp13974 = _mm512_shuffle_f32x4(tmp13956, tmp13960, 221);
__m512 tmp13975 = _mm512_shuffle_f32x4(tmp13957, tmp13961, 136);
__m512 tmp13976 = _mm512_shuffle_f32x4(tmp13957, tmp13961, 221);
__m512 tmp13977 = _mm512_shuffle_f32x4(tmp13958, tmp13962, 136);
__m512 tmp13978 = _mm512_shuffle_f32x4(tmp13958, tmp13962, 221);
__m512 tmp13979 = _mm512_shuffle_f32x4(tmp13963, tmp13967, 136);
__m512 tmp13980 = _mm512_shuffle_f32x4(tmp13963, tmp13967, 221);
__m512 tmp13981 = _mm512_shuffle_f32x4(tmp13964, tmp13968, 136);
__m512 tmp13982 = _mm512_shuffle_f32x4(tmp13964, tmp13968, 221);
__m512 tmp13983 = _mm512_shuffle_f32x4(tmp13965, tmp13969, 136);
__m512 tmp13984 = _mm512_shuffle_f32x4(tmp13965, tmp13969, 221);
__m512 tmp13985 = _mm512_shuffle_f32x4(tmp13966, tmp13970, 136);
__m512 tmp13986 = _mm512_shuffle_f32x4(tmp13966, tmp13970, 221);
wt531 = _mm512_shuffle_f32x4(tmp13971, tmp13979, 136);
wt539 = _mm512_shuffle_f32x4(tmp13971, tmp13979, 221);
wt532 = _mm512_shuffle_f32x4(tmp13973, tmp13981, 136);
wt540 = _mm512_shuffle_f32x4(tmp13973, tmp13981, 221);
wt533 = _mm512_shuffle_f32x4(tmp13975, tmp13983, 136);
wt541 = _mm512_shuffle_f32x4(tmp13975, tmp13983, 221);
wt534 = _mm512_shuffle_f32x4(tmp13977, tmp13985, 136);
wt542 = _mm512_shuffle_f32x4(tmp13977, tmp13985, 221);
wt535 = _mm512_shuffle_f32x4(tmp13972, tmp13980, 136);
wt543 = _mm512_shuffle_f32x4(tmp13972, tmp13980, 221);
wt536 = _mm512_shuffle_f32x4(tmp13974, tmp13982, 136);
wt544 = _mm512_shuffle_f32x4(tmp13974, tmp13982, 221);
wt537 = _mm512_shuffle_f32x4(tmp13976, tmp13984, 136);
wt545 = _mm512_shuffle_f32x4(tmp13976, tmp13984, 221);
wt538 = _mm512_shuffle_f32x4(tmp13978, tmp13986, 136);
wt546 = _mm512_shuffle_f32x4(tmp13978, tmp13986, 221);
wt531 = _mm512_mul_ps(wt531, postMul47);
wt532 = _mm512_mul_ps(wt532, postMul47);
wt533 = _mm512_mul_ps(wt533, postMul47);
wt534 = _mm512_mul_ps(wt534, postMul47);
wt535 = _mm512_mul_ps(wt535, postMul47);
wt536 = _mm512_mul_ps(wt536, postMul47);
wt537 = _mm512_mul_ps(wt537, postMul47);
wt538 = _mm512_mul_ps(wt538, postMul47);
wt539 = _mm512_mul_ps(wt539, postMul47);
wt540 = _mm512_mul_ps(wt540, postMul47);
wt541 = _mm512_mul_ps(wt541, postMul47);
wt542 = _mm512_mul_ps(wt542, postMul47);
wt543 = _mm512_mul_ps(wt543, postMul47);
wt544 = _mm512_mul_ps(wt544, postMul47);
wt545 = _mm512_mul_ps(wt545, postMul47);
wt546 = _mm512_mul_ps(wt546, postMul47);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(1+16*c44)+(ptrdiff_t)0, 63>>cut20, wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(2+16*c44)+(ptrdiff_t)0, 63>>cut20, wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(3+16*c44)+(ptrdiff_t)0, 63>>cut20, wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(4+16*c44)+(ptrdiff_t)0, 63>>cut20, wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(5+16*c44)+(ptrdiff_t)0, 63>>cut20, wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(6+16*c44)+(ptrdiff_t)0, 63>>cut20, wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(7+16*c44)+(ptrdiff_t)0, 63>>cut20, wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(8+16*c44)+(ptrdiff_t)0, 63>>cut20, wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(9+16*c44)+(ptrdiff_t)0, 63>>cut20, wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(10+16*c44)+(ptrdiff_t)0, 63>>cut20, wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(11+16*c44)+(ptrdiff_t)0, 63>>cut20, wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(12+16*c44)+(ptrdiff_t)0, 63>>cut20, wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(13+16*c44)+(ptrdiff_t)0, 63>>cut20, wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(14+16*c44)+(ptrdiff_t)0, 63>>cut20, wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(15+16*c44)+(ptrdiff_t)0, 63>>cut20, wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(16+16*c44)+(ptrdiff_t)0, 63>>cut20, wt546);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(1+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(2+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(3+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(4+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(5+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(6+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(7+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(8+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(9+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(10+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(11+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(12+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(13+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(14+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(15+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(16+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt546);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(1+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(2+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(3+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(4+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(5+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(6+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(7+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(8+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(9+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(10+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(11+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(12+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(13+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(14+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(15+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(16+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt546);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(1+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(2+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(3+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(4+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(5+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(6+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(7+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(8+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(9+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(10+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(11+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(12+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(13+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(14+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(15+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(16+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt546);
}
}
}
}
}

// Builds a task descriptor for the weight-arranging callee and hands it to
// the threader team.
//
// team59:    threader team passed through to ResNeXt50ThreaderDo1.
// tensors91: tensor pointer table, forwarded untouched to the callee via any1.
//
// The task is declared as 3-dimensional with extents 32 x 1 x 1.
// NOTE(review): presumably the threader invokes the callee once per point of
// that hull -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeWts9(ResNeXt50ThreaderTeam1* team59, char** tensors91) {
	ResNeXt50ThreaderTask1 job;
	// Iteration space first ...
	job.nd1 = 3;
	job.hull1[0] = 32;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	// ... then the payload and the worker entry point.
	job.any1 = tensors91;
	job.callee1 = ResNeXt50OneArrangeWts9Callee1;
	ResNeXt50ThreaderDo1(team59, &job);
}

// Worker that copies one tile of the input tensor into the "arranged" buffer.
//
// task96: task descriptor; any1 holds a char** table where entry 0 is the
//         source buffer and entry 1 is the destination buffer.
// pt53:   coordinates of this work item. pt53[0] selects a run of 128
//         consecutive source rows (rows 128*pt53[0] .. 128*pt53[0]+127) and
//         pt53[1] selects the column block inside each row.
//
// Source rows are 3136 bytes apart. Column blocks 0..11 are 256 bytes wide
// (four 64-byte vectors); column block 12 is the 64-byte remainder of the row
// (12*256 + 64 == 3136).
// NOTE(review): 3136 bytes looks like 28*28 float32 values per channel and the
// row index looks like the channel index -- confirm against the graph.
//
// Destination layout: column block j starts at byte 131072*j; inside a block
// every row occupies 256 bytes (64 bytes for the remainder block).
static void ResNeXt50OneArrangeDats9Callee1(ResNeXt50ThreaderTask1* task96, int64_t* pt53) {
	char** bufs = task96->any1;
	const ptrdiff_t slab = pt53[0];
	const ptrdiff_t col = pt53[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t rowBegin = 128*slab;
	const ptrdiff_t rowEnd = rowBegin+128;

	if (col != 12) {
		// Full-width column block: move 4 x 64 bytes per row.
		for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
			const char* from = src+256*col+3136*row;
			char* to = dst+131072*col+256*row;
			// All four loads are issued before the first store, as in the
			// generated code.
			__m512 v0 = _mm512_maskz_loadu_ps(65535, from);
			__m512 v1 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)64);
			__m512 v2 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)128);
			__m512 v3 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)192);
			_mm512_mask_storeu_ps(to, 65535, v0);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)64, 65535, v1);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)128, 65535, v2);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)192, 65535, v3);
		}
		return;
	}

	// Remainder column block (col == 12): a single 64-byte vector per row,
	// packed with a 64-byte row stride in the destination.
	for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
		__m512 v = _mm512_maskz_loadu_ps(65535, src+256*col+3136*row);
		_mm512_mask_storeu_ps(dst+131072*col+64*row, 65535, v);
	}
}

// Builds a task descriptor for the data-arranging callee and hands it to the
// threader team.
//
// team60:    threader team passed through to ResNeXt50ThreaderDo1.
// tensors93: tensor pointer table, forwarded untouched to the callee via any1.
//
// The task is declared as 4-dimensional with extents 4 x 13 x 1 x 1. The
// callee reads the first two coordinates: a 128-row slab index and a column
// block index.
// NOTE(review): presumably the threader invokes the callee once per point of
// that hull -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeDats9(ResNeXt50ThreaderTeam1* team60, char** tensors93) {
	ResNeXt50ThreaderTask1 job;
	// Iteration space first ...
	job.nd1 = 4;
	job.hull1[0] = 4;
	job.hull1[1] = 13;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	// ... then the payload and the worker entry point.
	job.any1 = tensors93;
	job.callee1 = ResNeXt50OneArrangeDats9Callee1;
	ResNeXt50ThreaderDo1(team60, &job);
}

// Worker callee for the OneApply9 step: computes one output tile as a sum of
// products of arranged weights and arranged data, clamps negatives to zero,
// and stores the result.
//
// task98->any1 is read as a void*[]; its element 0 is a char*[] of tensors:
//   tensors[0] = arranged weights (read)
//   tensors[1] = arranged data    (read)
//   tensors[2] = output           (written)
// pt54[0] (w64) selects the weight group k; pt54[1] (d19) selects the data
// column block j.
//
// Weight groups are 12312 bytes apart. For k != 85 a group uses 24-byte
// records (6 floats, one per output row); for k == 85 it uses 8-byte records
// (2 floats). The first record of a group seeds the accumulators (it is read
// with s = -1, i.e. at offset 0); the following 512 records are consumed by
// the s loop. NOTE(review): the seed record is presumably the folded
// bias/batch-norm term -- confirm against the weight arranger.
//
// Data column blocks are 131072 bytes apart. For j != 12 each s step reads
// four 16-float vectors (256-byte pitch); for j == 12 it reads one vector
// (64-byte pitch).
//
// Output rows are 3136 bytes apart; a full group of 6 rows spans 18816 bytes.
//
// Each call handles exactly one (k, j) tile. The `for (; x != N; ++x)` loops
// are not real loops: their trailing `if (x >= xx) return;` guard compares
// against the starting value and is therefore always true on the first pass.
// They only select between the "full" body (x != N) and the "tail" code that
// follows the loop (x == N).
static void ResNeXt50OneApply9Callee1(ResNeXt50ThreaderTask1* task98, int64_t* pt54) {
void** pair24 = task98->any1;
char** tensors96 = pair24[0];
// e27 and g30 are fixed at 0, so the base-pointer offsets below all vanish.
ptrdiff_t e27 = 0;
ptrdiff_t g30 = 0;
ptrdiff_t d19 = pt54[1];
ptrdiff_t w64 = pt54[0];
char*restrict arrangedWts9 = tensors96[0]+1712128*e27+(ptrdiff_t)1050624*1*g30;
char*restrict arrangedDats9 = tensors96[1]+2618560*e27+(ptrdiff_t)1605632*1*g30;
char*restrict datPtr30 = tensors96[2]+(ptrdiff_t)1605632*1*g30;
// Single-trip loop (ii43 == 1); i58 is always 0.
ptrdiff_t ii43 = 1;
for (ptrdiff_t i58 = 0; i58 < ii43; ++i58) {
ptrdiff_t j50 = 1*d19;
ptrdiff_t jj50 = j50+0;
// ---- Column blocks j != 12: four vectors (64 floats) wide. ----
for (; j50 != 12; ++j50) {
ptrdiff_t k151 = 1*w64;
ptrdiff_t kk51 = k151+0;
// -- Weight groups k != 85: 6 output rows x 4 vectors = 24 accumulators. --
for (; k151 != 85; ++k151) {
// s53 = -1 makes 24*s53+24 == 0: read the group's leading seed record.
ptrdiff_t s53 = -1;
__m512 sum456 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)24));
__m512 sum460 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)28));
__m512 sum464 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)32));
__m512 sum468 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)36));
__m512 sum472 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)40));
__m512 sum476 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)44));
// Each output row starts all four of its column accumulators from the same seed.
__m512 sum457 = sum456;
__m512 sum458 = sum456;
__m512 sum459 = sum456;
__m512 sum461 = sum460;
__m512 sum462 = sum460;
__m512 sum463 = sum460;
__m512 sum465 = sum464;
__m512 sum466 = sum464;
__m512 sum467 = sum464;
__m512 sum469 = sum468;
__m512 sum470 = sum468;
__m512 sum471 = sum468;
__m512 sum473 = sum472;
__m512 sum474 = sum472;
__m512 sum475 = sum472;
__m512 sum477 = sum476;
__m512 sum478 = sum476;
__m512 sum479 = sum476;
// Reduction over 512 steps: load 4 data vectors once, then broadcast each of
// the 6 weights of record s53 and fuse-multiply-add it into that row's accumulators.
for (s53 = 0; s53 < 512; ++s53) {
__m512 dat2354 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)0);
__m512 dat2355 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)64);
__m512 dat2356 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)128);
__m512 dat2357 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)192);
__m512 wt579 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)24));
sum456 = _mm512_fmadd_ps(wt579, dat2354, sum456);
sum457 = _mm512_fmadd_ps(wt579, dat2355, sum457);
sum458 = _mm512_fmadd_ps(wt579, dat2356, sum458);
sum459 = _mm512_fmadd_ps(wt579, dat2357, sum459);
__m512 wt580 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)28));
sum460 = _mm512_fmadd_ps(wt580, dat2354, sum460);
sum461 = _mm512_fmadd_ps(wt580, dat2355, sum461);
sum462 = _mm512_fmadd_ps(wt580, dat2356, sum462);
sum463 = _mm512_fmadd_ps(wt580, dat2357, sum463);
__m512 wt581 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)32));
sum464 = _mm512_fmadd_ps(wt581, dat2354, sum464);
sum465 = _mm512_fmadd_ps(wt581, dat2355, sum465);
sum466 = _mm512_fmadd_ps(wt581, dat2356, sum466);
sum467 = _mm512_fmadd_ps(wt581, dat2357, sum467);
__m512 wt582 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)36));
sum468 = _mm512_fmadd_ps(wt582, dat2354, sum468);
sum469 = _mm512_fmadd_ps(wt582, dat2355, sum469);
sum470 = _mm512_fmadd_ps(wt582, dat2356, sum470);
sum471 = _mm512_fmadd_ps(wt582, dat2357, sum471);
__m512 wt583 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)40));
sum472 = _mm512_fmadd_ps(wt583, dat2354, sum472);
sum473 = _mm512_fmadd_ps(wt583, dat2355, sum473);
sum474 = _mm512_fmadd_ps(wt583, dat2356, sum474);
sum475 = _mm512_fmadd_ps(wt583, dat2357, sum475);
__m512 wt584 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)44));
sum476 = _mm512_fmadd_ps(wt584, dat2354, sum476);
sum477 = _mm512_fmadd_ps(wt584, dat2355, sum477);
sum478 = _mm512_fmadd_ps(wt584, dat2356, sum478);
sum479 = _mm512_fmadd_ps(wt584, dat2357, sum479);
}
// Epilogue: max(0, x) on every accumulator, then store. Output row r of the
// group lands at byte offset 3136*r; mask 65535 enables all 16 lanes.
sum456 = _mm512_max_ps(_mm512_setzero_ps(), sum456);
sum457 = _mm512_max_ps(_mm512_setzero_ps(), sum457);
sum458 = _mm512_max_ps(_mm512_setzero_ps(), sum458);
sum459 = _mm512_max_ps(_mm512_setzero_ps(), sum459);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)0, 65535, sum456);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)64, 65535, sum457);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)128, 65535, sum458);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)192, 65535, sum459);
sum460 = _mm512_max_ps(_mm512_setzero_ps(), sum460);
sum461 = _mm512_max_ps(_mm512_setzero_ps(), sum461);
sum462 = _mm512_max_ps(_mm512_setzero_ps(), sum462);
sum463 = _mm512_max_ps(_mm512_setzero_ps(), sum463);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3136, 65535, sum460);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3200, 65535, sum461);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3264, 65535, sum462);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3328, 65535, sum463);
sum464 = _mm512_max_ps(_mm512_setzero_ps(), sum464);
sum465 = _mm512_max_ps(_mm512_setzero_ps(), sum465);
sum466 = _mm512_max_ps(_mm512_setzero_ps(), sum466);
sum467 = _mm512_max_ps(_mm512_setzero_ps(), sum467);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6272, 65535, sum464);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6336, 65535, sum465);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6400, 65535, sum466);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6464, 65535, sum467);
sum468 = _mm512_max_ps(_mm512_setzero_ps(), sum468);
sum469 = _mm512_max_ps(_mm512_setzero_ps(), sum469);
sum470 = _mm512_max_ps(_mm512_setzero_ps(), sum470);
sum471 = _mm512_max_ps(_mm512_setzero_ps(), sum471);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9408, 65535, sum468);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9472, 65535, sum469);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9536, 65535, sum470);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9600, 65535, sum471);
sum472 = _mm512_max_ps(_mm512_setzero_ps(), sum472);
sum473 = _mm512_max_ps(_mm512_setzero_ps(), sum473);
sum474 = _mm512_max_ps(_mm512_setzero_ps(), sum474);
sum475 = _mm512_max_ps(_mm512_setzero_ps(), sum475);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12544, 65535, sum472);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12608, 65535, sum473);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12672, 65535, sum474);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12736, 65535, sum475);
sum476 = _mm512_max_ps(_mm512_setzero_ps(), sum476);
sum477 = _mm512_max_ps(_mm512_setzero_ps(), sum477);
sum478 = _mm512_max_ps(_mm512_setzero_ps(), sum478);
sum479 = _mm512_max_ps(_mm512_setzero_ps(), sum479);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15680, 65535, sum476);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15744, 65535, sum477);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15808, 65535, sum478);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15872, 65535, sum479);
// Always true on the first pass (kk51 is the starting k151): one tile per call.
if (k151 >= kk51) return;
}
// -- Weight group k == 85: 2 output rows x 4 vectors, 8-byte weight records. --
ptrdiff_t s54 = -1;
__m512 sum480 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)8));
__m512 sum484 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)12));
__m512 sum481 = sum480;
__m512 sum482 = sum480;
__m512 sum483 = sum480;
__m512 sum485 = sum484;
__m512 sum486 = sum484;
__m512 sum487 = sum484;
for (s54 = 0; s54 < 512; ++s54) {
__m512 dat2358 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)0);
__m512 dat2359 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)64);
__m512 dat2360 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)128);
__m512 dat2361 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)192);
__m512 wt585 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)8));
sum480 = _mm512_fmadd_ps(wt585, dat2358, sum480);
sum481 = _mm512_fmadd_ps(wt585, dat2359, sum481);
sum482 = _mm512_fmadd_ps(wt585, dat2360, sum482);
sum483 = _mm512_fmadd_ps(wt585, dat2361, sum483);
__m512 wt586 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)12));
sum484 = _mm512_fmadd_ps(wt586, dat2358, sum484);
sum485 = _mm512_fmadd_ps(wt586, dat2359, sum485);
sum486 = _mm512_fmadd_ps(wt586, dat2360, sum486);
sum487 = _mm512_fmadd_ps(wt586, dat2361, sum487);
}
sum480 = _mm512_max_ps(_mm512_setzero_ps(), sum480);
sum481 = _mm512_max_ps(_mm512_setzero_ps(), sum481);
sum482 = _mm512_max_ps(_mm512_setzero_ps(), sum482);
sum483 = _mm512_max_ps(_mm512_setzero_ps(), sum483);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)0, 65535, sum480);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)64, 65535, sum481);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)128, 65535, sum482);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)192, 65535, sum483);
sum484 = _mm512_max_ps(_mm512_setzero_ps(), sum484);
sum485 = _mm512_max_ps(_mm512_setzero_ps(), sum485);
sum486 = _mm512_max_ps(_mm512_setzero_ps(), sum486);
sum487 = _mm512_max_ps(_mm512_setzero_ps(), sum487);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3136, 65535, sum484);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3200, 65535, sum485);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3264, 65535, sum486);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3328, 65535, sum487);
// Always true on the first pass (jj50 is the starting j50).
if (j50 >= jj50) return;
}
// ---- Column block j == 12: one vector (16 floats) wide, 64-byte data pitch. ----
ptrdiff_t k152 = 1*w64;
ptrdiff_t kk52 = k152+0;
// -- Weight groups k != 85: 6 output rows x 1 vector. --
for (; k152 != 85; ++k152) {
ptrdiff_t s55 = -1;
__m512 sum488 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)24));
__m512 sum489 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)28));
__m512 sum490 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)32));
__m512 sum491 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)36));
__m512 sum492 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)40));
__m512 sum493 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)44));
for (s55 = 0; s55 < 512; ++s55) {
__m512 dat2362 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+64*s55+(ptrdiff_t)0);
__m512 wt587 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)24));
sum488 = _mm512_fmadd_ps(wt587, dat2362, sum488);
__m512 wt588 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)28));
sum489 = _mm512_fmadd_ps(wt588, dat2362, sum489);
__m512 wt589 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)32));
sum490 = _mm512_fmadd_ps(wt589, dat2362, sum490);
__m512 wt590 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)36));
sum491 = _mm512_fmadd_ps(wt590, dat2362, sum491);
__m512 wt591 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)40));
sum492 = _mm512_fmadd_ps(wt591, dat2362, sum492);
__m512 wt592 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)44));
sum493 = _mm512_fmadd_ps(wt592, dat2362, sum493);
}
sum488 = _mm512_max_ps(_mm512_setzero_ps(), sum488);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)0, 65535, sum488);
sum489 = _mm512_max_ps(_mm512_setzero_ps(), sum489);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)3136, 65535, sum489);
sum490 = _mm512_max_ps(_mm512_setzero_ps(), sum490);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)6272, 65535, sum490);
sum491 = _mm512_max_ps(_mm512_setzero_ps(), sum491);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)9408, 65535, sum491);
sum492 = _mm512_max_ps(_mm512_setzero_ps(), sum492);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)12544, 65535, sum492);
sum493 = _mm512_max_ps(_mm512_setzero_ps(), sum493);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)15680, 65535, sum493);
// Always true on the first pass (kk52 is the starting k152).
if (k152 >= kk52) return;
}
// -- Weight group k == 85: 2 output rows x 1 vector (the corner tile). --
ptrdiff_t s56 = -1;
__m512 sum494 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)8));
__m512 sum495 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)12));
for (s56 = 0; s56 < 512; ++s56) {
__m512 dat2363 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+64*s56+(ptrdiff_t)0);
__m512 wt593 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)8));
sum494 = _mm512_fmadd_ps(wt593, dat2363, sum494);
__m512 wt594 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)12));
sum495 = _mm512_fmadd_ps(wt594, dat2363, sum495);
}
sum494 = _mm512_max_ps(_mm512_setzero_ps(), sum494);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)0, 65535, sum494);
sum495 = _mm512_max_ps(_mm512_setzero_ps(), sum495);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)3136, 65535, sum495);
}
}

// Dispatches ResNeXt50OneApply9Callee1 through ResNeXt50ThreaderDo1 on the
// given team. The task payload is a two-slot pointer array: slot 0 carries
// tensors95 (the only slot the callee reads), slot 1 is null.
// The task is declared with 3 dimensions and hull {86, 13, 1}.
// NOTE(review): the hull presumably bounds the coordinates handed to the
// callee (pt[0] in 0..85, pt[1] in 0..12) -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneApply9(ResNeXt50ThreaderTeam1* team61, char** tensors95) {
	static const int64_t extents[3] = {86, 13, 1};
	void* payload[2];
	payload[0] = tensors95;
	payload[1] = 0;
	ResNeXt50ThreaderTask1 job;
	job.callee1 = ResNeXt50OneApply9Callee1;
	job.any1 = payload;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	ResNeXt50ThreaderDo1(team61, &job);
}

static void ResNeXt50OneArrangeWts10Callee1(ResNeXt50ThreaderTask1* task108, int64_t* pt59) {
char** tensors106 = task108->any1;
ptrdiff_t b81 = pt59[0];
char*restrict wtPtr17 = tensors106[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2097152*0;
char*restrict biasPtr17 = tensors106[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr18 = tensors106[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged19 = tensors106[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)2101248*0;
ptrdiff_t ii46 = 1;
for (ptrdiff_t i64 = 0; i64 < ii46; ++i64) {
ptrdiff_t j56 = 1*b81;
ptrdiff_t jj54 = j56+1;
for (; j56 < jj54; ++j56) {
if (j56 < 63) {
ptrdiff_t k161 = 0+16*(j56-0);
ptrdiff_t l69 = (size_t)(0+k161)/6;
ptrdiff_t cut23 = (size_t)(0+k161)%6;
switch (cut23) {
case 0:;
case 2: {
__m512 sum497 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i64+4*k161);
__m512i pmMul35 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd35 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo29 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k161+1024*i64));
__m512 masHi29 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k161+1024*i64)+(ptrdiff_t)64);
__m512 postMul54 = _mm512_permutex2var_ps(masLo29, pmMul35, masHi29);
__m512 postAdd36 = _mm512_permutex2var_ps(masLo29, pmAdd35, masHi29);
sum497 = _mm512_fmadd_ps(sum497, postMul54, postAdd36);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)0, 63>>cut23, sum497);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)12288, 4032>>cut23, sum497);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)24576, 65535-(4095>>cut23), sum497);
ptrdiff_t c52 = 0;
for (; c52 != 32; ++c52) {
__m512 wt617 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)0);
__m512 wt618 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)2048);
__m512 wt619 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)4096);
__m512 wt620 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)6144);
__m512 wt621 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)8192);
__m512 wt622 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)10240);
__m512 wt623 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)12288);
__m512 wt624 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)14336);
__m512 wt625 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)16384);
__m512 wt626 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)18432);
__m512 wt627 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)20480);
__m512 wt628 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)22528);
__m512 wt629 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)24576);
__m512 wt630 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)26624);
__m512 wt631 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)28672);
__m512 wt632 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)30720);
__m512 tmp13987 = _mm512_unpacklo_ps(wt617, wt618);
__m512 tmp13988 = _mm512_unpackhi_ps(wt617, wt618);
__m512 tmp13989 = _mm512_unpacklo_ps(wt619, wt620);
__m512 tmp13990 = _mm512_unpackhi_ps(wt619, wt620);
__m512 tmp13991 = _mm512_unpacklo_ps(wt621, wt622);
__m512 tmp13992 = _mm512_unpackhi_ps(wt621, wt622);
__m512 tmp13993 = _mm512_unpacklo_ps(wt623, wt624);
__m512 tmp13994 = _mm512_unpackhi_ps(wt623, wt624);
__m512 tmp13995 = _mm512_unpacklo_ps(wt625, wt626);
__m512 tmp13996 = _mm512_unpackhi_ps(wt625, wt626);
__m512 tmp13997 = _mm512_unpacklo_ps(wt627, wt628);
__m512 tmp13998 = _mm512_unpackhi_ps(wt627, wt628);
__m512 tmp13999 = _mm512_unpacklo_ps(wt629, wt630);
__m512 tmp14000 = _mm512_unpackhi_ps(wt629, wt630);
__m512 tmp14001 = _mm512_unpacklo_ps(wt631, wt632);
__m512 tmp14002 = _mm512_unpackhi_ps(wt631, wt632);
__m512 tmp14003 = _mm512_shuffle_ps(tmp13987, tmp13989, 68);
__m512 tmp14004 = _mm512_shuffle_ps(tmp13987, tmp13989, 238);
__m512 tmp14005 = _mm512_shuffle_ps(tmp13988, tmp13990, 68);
__m512 tmp14006 = _mm512_shuffle_ps(tmp13988, tmp13990, 238);
__m512 tmp14007 = _mm512_shuffle_ps(tmp13991, tmp13993, 68);
__m512 tmp14008 = _mm512_shuffle_ps(tmp13991, tmp13993, 238);
__m512 tmp14009 = _mm512_shuffle_ps(tmp13992, tmp13994, 68);
__m512 tmp14010 = _mm512_shuffle_ps(tmp13992, tmp13994, 238);
__m512 tmp14011 = _mm512_shuffle_ps(tmp13995, tmp13997, 68);
__m512 tmp14012 = _mm512_shuffle_ps(tmp13995, tmp13997, 238);
__m512 tmp14013 = _mm512_shuffle_ps(tmp13996, tmp13998, 68);
__m512 tmp14014 = _mm512_shuffle_ps(tmp13996, tmp13998, 238);
__m512 tmp14015 = _mm512_shuffle_ps(tmp13999, tmp14001, 68);
__m512 tmp14016 = _mm512_shuffle_ps(tmp13999, tmp14001, 238);
__m512 tmp14017 = _mm512_shuffle_ps(tmp14000, tmp14002, 68);
__m512 tmp14018 = _mm512_shuffle_ps(tmp14000, tmp14002, 238);
__m512 tmp14019 = _mm512_shuffle_f32x4(tmp14003, tmp14007, 136);
__m512 tmp14020 = _mm512_shuffle_f32x4(tmp14003, tmp14007, 221);
__m512 tmp14021 = _mm512_shuffle_f32x4(tmp14004, tmp14008, 136);
__m512 tmp14022 = _mm512_shuffle_f32x4(tmp14004, tmp14008, 221);
__m512 tmp14023 = _mm512_shuffle_f32x4(tmp14005, tmp14009, 136);
__m512 tmp14024 = _mm512_shuffle_f32x4(tmp14005, tmp14009, 221);
__m512 tmp14025 = _mm512_shuffle_f32x4(tmp14006, tmp14010, 136);
__m512 tmp14026 = _mm512_shuffle_f32x4(tmp14006, tmp14010, 221);
__m512 tmp14027 = _mm512_shuffle_f32x4(tmp14011, tmp14015, 136);
__m512 tmp14028 = _mm512_shuffle_f32x4(tmp14011, tmp14015, 221);
__m512 tmp14029 = _mm512_shuffle_f32x4(tmp14012, tmp14016, 136);
__m512 tmp14030 = _mm512_shuffle_f32x4(tmp14012, tmp14016, 221);
__m512 tmp14031 = _mm512_shuffle_f32x4(tmp14013, tmp14017, 136);
__m512 tmp14032 = _mm512_shuffle_f32x4(tmp14013, tmp14017, 221);
__m512 tmp14033 = _mm512_shuffle_f32x4(tmp14014, tmp14018, 136);
__m512 tmp14034 = _mm512_shuffle_f32x4(tmp14014, tmp14018, 221);
wt617 = _mm512_shuffle_f32x4(tmp14019, tmp14027, 136);
wt625 = _mm512_shuffle_f32x4(tmp14019, tmp14027, 221);
wt618 = _mm512_shuffle_f32x4(tmp14021, tmp14029, 136);
wt626 = _mm512_shuffle_f32x4(tmp14021, tmp14029, 221);
wt619 = _mm512_shuffle_f32x4(tmp14023, tmp14031, 136);
wt627 = _mm512_shuffle_f32x4(tmp14023, tmp14031, 221);
wt620 = _mm512_shuffle_f32x4(tmp14025, tmp14033, 136);
wt628 = _mm512_shuffle_f32x4(tmp14025, tmp14033, 221);
wt621 = _mm512_shuffle_f32x4(tmp14020, tmp14028, 136);
wt629 = _mm512_shuffle_f32x4(tmp14020, tmp14028, 221);
wt622 = _mm512_shuffle_f32x4(tmp14022, tmp14030, 136);
wt630 = _mm512_shuffle_f32x4(tmp14022, tmp14030, 221);
wt623 = _mm512_shuffle_f32x4(tmp14024, tmp14032, 136);
wt631 = _mm512_shuffle_f32x4(tmp14024, tmp14032, 221);
wt624 = _mm512_shuffle_f32x4(tmp14026, tmp14034, 136);
wt632 = _mm512_shuffle_f32x4(tmp14026, tmp14034, 221);
wt617 = _mm512_mul_ps(wt617, postMul54);
wt618 = _mm512_mul_ps(wt618, postMul54);
wt619 = _mm512_mul_ps(wt619, postMul54);
wt620 = _mm512_mul_ps(wt620, postMul54);
wt621 = _mm512_mul_ps(wt621, postMul54);
wt622 = _mm512_mul_ps(wt622, postMul54);
wt623 = _mm512_mul_ps(wt623, postMul54);
wt624 = _mm512_mul_ps(wt624, postMul54);
wt625 = _mm512_mul_ps(wt625, postMul54);
wt626 = _mm512_mul_ps(wt626, postMul54);
wt627 = _mm512_mul_ps(wt627, postMul54);
wt628 = _mm512_mul_ps(wt628, postMul54);
wt629 = _mm512_mul_ps(wt629, postMul54);
wt630 = _mm512_mul_ps(wt630, postMul54);
wt631 = _mm512_mul_ps(wt631, postMul54);
wt632 = _mm512_mul_ps(wt632, postMul54);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c52)+(ptrdiff_t)0, 63>>cut23, wt617);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c52)+(ptrdiff_t)0, 63>>cut23, wt618);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c52)+(ptrdiff_t)0, 63>>cut23, wt619);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c52)+(ptrdiff_t)0, 63>>cut23, wt620);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c52)+(ptrdiff_t)0, 63>>cut23, wt621);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c52)+(ptrdiff_t)0, 63>>cut23, wt622);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c52)+(ptrdiff_t)0, 63>>cut23, wt623);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c52)+(ptrdiff_t)0, 63>>cut23, wt624);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c52)+(ptrdiff_t)0, 63>>cut23, wt625);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c52)+(ptrdiff_t)0, 63>>cut23, wt626);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c52)+(ptrdiff_t)0, 63>>cut23, wt627);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c52)+(ptrdiff_t)0, 63>>cut23, wt628);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c52)+(ptrdiff_t)0, 63>>cut23, wt629);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c52)+(ptrdiff_t)0, 63>>cut23, wt630);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c52)+(ptrdiff_t)0, 63>>cut23, wt631);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c52)+(ptrdiff_t)0, 63>>cut23, wt632);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt617);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt618);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt619);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt620);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt621);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt622);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt623);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt624);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt625);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt626);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt627);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt628);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt629);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt630);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt631);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt632);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt617);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt618);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt619);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt620);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt621);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt622);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt623);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt624);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt625);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt626);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt627);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt628);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt629);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt630);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt631);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt632);
}
break;
}
default: {
cut23 = 4;
__m512 sum498 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i64+4*k161);
__m512i pmMul36 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd36 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo30 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k161+1024*i64));
__m512 masHi30 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k161+1024*i64)+(ptrdiff_t)64);
__m512 postMul55 = _mm512_permutex2var_ps(masLo30, pmMul36, masHi30);
__m512 postAdd37 = _mm512_permutex2var_ps(masLo30, pmAdd36, masHi30);
sum498 = _mm512_fmadd_ps(sum498, postMul55, postAdd37);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)0, 63>>cut23, sum498);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)12288, 4032>>cut23, sum498);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)24576, 258048>>cut23, sum498);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)36864, 65535-(262143>>cut23), sum498);
ptrdiff_t c53 = 0;
for (; c53 != 32; ++c53) {
__m512 wt633 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)0);
__m512 wt634 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)2048);
__m512 wt635 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)4096);
__m512 wt636 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)6144);
__m512 wt637 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)8192);
__m512 wt638 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)10240);
__m512 wt639 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)12288);
__m512 wt640 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)14336);
__m512 wt641 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)16384);
__m512 wt642 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)18432);
__m512 wt643 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)20480);
__m512 wt644 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)22528);
__m512 wt645 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)24576);
__m512 wt646 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)26624);
__m512 wt647 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)28672);
__m512 wt648 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)30720);
__m512 tmp14035 = _mm512_unpacklo_ps(wt633, wt634);
__m512 tmp14036 = _mm512_unpackhi_ps(wt633, wt634);
__m512 tmp14037 = _mm512_unpacklo_ps(wt635, wt636);
__m512 tmp14038 = _mm512_unpackhi_ps(wt635, wt636);
__m512 tmp14039 = _mm512_unpacklo_ps(wt637, wt638);
__m512 tmp14040 = _mm512_unpackhi_ps(wt637, wt638);
__m512 tmp14041 = _mm512_unpacklo_ps(wt639, wt640);
__m512 tmp14042 = _mm512_unpackhi_ps(wt639, wt640);
__m512 tmp14043 = _mm512_unpacklo_ps(wt641, wt642);
__m512 tmp14044 = _mm512_unpackhi_ps(wt641, wt642);
__m512 tmp14045 = _mm512_unpacklo_ps(wt643, wt644);
__m512 tmp14046 = _mm512_unpackhi_ps(wt643, wt644);
__m512 tmp14047 = _mm512_unpacklo_ps(wt645, wt646);
__m512 tmp14048 = _mm512_unpackhi_ps(wt645, wt646);
__m512 tmp14049 = _mm512_unpacklo_ps(wt647, wt648);
__m512 tmp14050 = _mm512_unpackhi_ps(wt647, wt648);
__m512 tmp14051 = _mm512_shuffle_ps(tmp14035, tmp14037, 68);
__m512 tmp14052 = _mm512_shuffle_ps(tmp14035, tmp14037, 238);
__m512 tmp14053 = _mm512_shuffle_ps(tmp14036, tmp14038, 68);
__m512 tmp14054 = _mm512_shuffle_ps(tmp14036, tmp14038, 238);
__m512 tmp14055 = _mm512_shuffle_ps(tmp14039, tmp14041, 68);
__m512 tmp14056 = _mm512_shuffle_ps(tmp14039, tmp14041, 238);
__m512 tmp14057 = _mm512_shuffle_ps(tmp14040, tmp14042, 68);
__m512 tmp14058 = _mm512_shuffle_ps(tmp14040, tmp14042, 238);
__m512 tmp14059 = _mm512_shuffle_ps(tmp14043, tmp14045, 68);
__m512 tmp14060 = _mm512_shuffle_ps(tmp14043, tmp14045, 238);
__m512 tmp14061 = _mm512_shuffle_ps(tmp14044, tmp14046, 68);
__m512 tmp14062 = _mm512_shuffle_ps(tmp14044, tmp14046, 238);
__m512 tmp14063 = _mm512_shuffle_ps(tmp14047, tmp14049, 68);
__m512 tmp14064 = _mm512_shuffle_ps(tmp14047, tmp14049, 238);
__m512 tmp14065 = _mm512_shuffle_ps(tmp14048, tmp14050, 68);
__m512 tmp14066 = _mm512_shuffle_ps(tmp14048, tmp14050, 238);
__m512 tmp14067 = _mm512_shuffle_f32x4(tmp14051, tmp14055, 136);
__m512 tmp14068 = _mm512_shuffle_f32x4(tmp14051, tmp14055, 221);
__m512 tmp14069 = _mm512_shuffle_f32x4(tmp14052, tmp14056, 136);
__m512 tmp14070 = _mm512_shuffle_f32x4(tmp14052, tmp14056, 221);
__m512 tmp14071 = _mm512_shuffle_f32x4(tmp14053, tmp14057, 136);
__m512 tmp14072 = _mm512_shuffle_f32x4(tmp14053, tmp14057, 221);
__m512 tmp14073 = _mm512_shuffle_f32x4(tmp14054, tmp14058, 136);
__m512 tmp14074 = _mm512_shuffle_f32x4(tmp14054, tmp14058, 221);
__m512 tmp14075 = _mm512_shuffle_f32x4(tmp14059, tmp14063, 136);
__m512 tmp14076 = _mm512_shuffle_f32x4(tmp14059, tmp14063, 221);
__m512 tmp14077 = _mm512_shuffle_f32x4(tmp14060, tmp14064, 136);
__m512 tmp14078 = _mm512_shuffle_f32x4(tmp14060, tmp14064, 221);
__m512 tmp14079 = _mm512_shuffle_f32x4(tmp14061, tmp14065, 136);
__m512 tmp14080 = _mm512_shuffle_f32x4(tmp14061, tmp14065, 221);
__m512 tmp14081 = _mm512_shuffle_f32x4(tmp14062, tmp14066, 136);
__m512 tmp14082 = _mm512_shuffle_f32x4(tmp14062, tmp14066, 221);
wt633 = _mm512_shuffle_f32x4(tmp14067, tmp14075, 136);
wt641 = _mm512_shuffle_f32x4(tmp14067, tmp14075, 221);
wt634 = _mm512_shuffle_f32x4(tmp14069, tmp14077, 136);
wt642 = _mm512_shuffle_f32x4(tmp14069, tmp14077, 221);
wt635 = _mm512_shuffle_f32x4(tmp14071, tmp14079, 136);
wt643 = _mm512_shuffle_f32x4(tmp14071, tmp14079, 221);
wt636 = _mm512_shuffle_f32x4(tmp14073, tmp14081, 136);
wt644 = _mm512_shuffle_f32x4(tmp14073, tmp14081, 221);
wt637 = _mm512_shuffle_f32x4(tmp14068, tmp14076, 136);
wt645 = _mm512_shuffle_f32x4(tmp14068, tmp14076, 221);
wt638 = _mm512_shuffle_f32x4(tmp14070, tmp14078, 136);
wt646 = _mm512_shuffle_f32x4(tmp14070, tmp14078, 221);
wt639 = _mm512_shuffle_f32x4(tmp14072, tmp14080, 136);
wt647 = _mm512_shuffle_f32x4(tmp14072, tmp14080, 221);
wt640 = _mm512_shuffle_f32x4(tmp14074, tmp14082, 136);
wt648 = _mm512_shuffle_f32x4(tmp14074, tmp14082, 221);
wt633 = _mm512_mul_ps(wt633, postMul55);
wt634 = _mm512_mul_ps(wt634, postMul55);
wt635 = _mm512_mul_ps(wt635, postMul55);
wt636 = _mm512_mul_ps(wt636, postMul55);
wt637 = _mm512_mul_ps(wt637, postMul55);
wt638 = _mm512_mul_ps(wt638, postMul55);
wt639 = _mm512_mul_ps(wt639, postMul55);
wt640 = _mm512_mul_ps(wt640, postMul55);
wt641 = _mm512_mul_ps(wt641, postMul55);
wt642 = _mm512_mul_ps(wt642, postMul55);
wt643 = _mm512_mul_ps(wt643, postMul55);
wt644 = _mm512_mul_ps(wt644, postMul55);
wt645 = _mm512_mul_ps(wt645, postMul55);
wt646 = _mm512_mul_ps(wt646, postMul55);
wt647 = _mm512_mul_ps(wt647, postMul55);
wt648 = _mm512_mul_ps(wt648, postMul55);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)0, 63>>cut23, wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)0, 63>>cut23, wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)0, 63>>cut23, wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)0, 63>>cut23, wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)0, 63>>cut23, wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)0, 63>>cut23, wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)0, 63>>cut23, wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)0, 63>>cut23, wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)0, 63>>cut23, wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)0, 63>>cut23, wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)0, 63>>cut23, wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)0, 63>>cut23, wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)0, 63>>cut23, wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)0, 63>>cut23, wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)0, 63>>cut23, wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)0, 63>>cut23, wt648);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt648);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt648);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt648);
}
}
}
} else {
ptrdiff_t k160 = 1008;
ptrdiff_t l68 = (size_t)(0+k160)/6;
ptrdiff_t cut22 = (size_t)(0+k160)%6;
__m512 sum496 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i64+4*k160);
__m512i pmMul37 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd37 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo31 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k160+1024*i64));
__m512 masHi31 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k160+1024*i64)+(ptrdiff_t)64);
__m512 postMul53 = _mm512_permutex2var_ps(masLo31, pmMul37, masHi31);
__m512 postAdd35 = _mm512_permutex2var_ps(masLo31, pmAdd37, masHi31);
sum496 = _mm512_fmadd_ps(sum496, postMul53, postAdd35);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*0+(ptrdiff_t)0, 63>>cut22, sum496);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*0+(ptrdiff_t)12288, 4032>>cut22, sum496);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*0+(ptrdiff_t)24576, 65535-(4095>>cut22), sum496);
ptrdiff_t c51 = 0;
for (; c51 != 32; ++c51) {
__m512 wt601 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)0);
__m512 wt602 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)2048);
__m512 wt603 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)4096);
__m512 wt604 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)6144);
__m512 wt605 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)8192);
__m512 wt606 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)10240);
__m512 wt607 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)12288);
__m512 wt608 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)14336);
__m512 wt609 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)16384);
__m512 wt610 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)18432);
__m512 wt611 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)20480);
__m512 wt612 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)22528);
__m512 wt613 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)24576);
__m512 wt614 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)26624);
__m512 wt615 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)28672);
__m512 wt616 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)30720);
__m512 tmp14083 = _mm512_unpacklo_ps(wt601, wt602);
__m512 tmp14084 = _mm512_unpackhi_ps(wt601, wt602);
__m512 tmp14085 = _mm512_unpacklo_ps(wt603, wt604);
__m512 tmp14086 = _mm512_unpackhi_ps(wt603, wt604);
__m512 tmp14087 = _mm512_unpacklo_ps(wt605, wt606);
__m512 tmp14088 = _mm512_unpackhi_ps(wt605, wt606);
__m512 tmp14089 = _mm512_unpacklo_ps(wt607, wt608);
__m512 tmp14090 = _mm512_unpackhi_ps(wt607, wt608);
__m512 tmp14091 = _mm512_unpacklo_ps(wt609, wt610);
__m512 tmp14092 = _mm512_unpackhi_ps(wt609, wt610);
__m512 tmp14093 = _mm512_unpacklo_ps(wt611, wt612);
__m512 tmp14094 = _mm512_unpackhi_ps(wt611, wt612);
__m512 tmp14095 = _mm512_unpacklo_ps(wt613, wt614);
__m512 tmp14096 = _mm512_unpackhi_ps(wt613, wt614);
__m512 tmp14097 = _mm512_unpacklo_ps(wt615, wt616);
__m512 tmp14098 = _mm512_unpackhi_ps(wt615, wt616);
__m512 tmp14099 = _mm512_shuffle_ps(tmp14083, tmp14085, 68);
__m512 tmp14100 = _mm512_shuffle_ps(tmp14083, tmp14085, 238);
__m512 tmp14101 = _mm512_shuffle_ps(tmp14084, tmp14086, 68);
__m512 tmp14102 = _mm512_shuffle_ps(tmp14084, tmp14086, 238);
__m512 tmp14103 = _mm512_shuffle_ps(tmp14087, tmp14089, 68);
__m512 tmp14104 = _mm512_shuffle_ps(tmp14087, tmp14089, 238);
__m512 tmp14105 = _mm512_shuffle_ps(tmp14088, tmp14090, 68);
__m512 tmp14106 = _mm512_shuffle_ps(tmp14088, tmp14090, 238);
__m512 tmp14107 = _mm512_shuffle_ps(tmp14091, tmp14093, 68);
__m512 tmp14108 = _mm512_shuffle_ps(tmp14091, tmp14093, 238);
__m512 tmp14109 = _mm512_shuffle_ps(tmp14092, tmp14094, 68);
__m512 tmp14110 = _mm512_shuffle_ps(tmp14092, tmp14094, 238);
__m512 tmp14111 = _mm512_shuffle_ps(tmp14095, tmp14097, 68);
__m512 tmp14112 = _mm512_shuffle_ps(tmp14095, tmp14097, 238);
__m512 tmp14113 = _mm512_shuffle_ps(tmp14096, tmp14098, 68);
__m512 tmp14114 = _mm512_shuffle_ps(tmp14096, tmp14098, 238);
__m512 tmp14115 = _mm512_shuffle_f32x4(tmp14099, tmp14103, 136);
__m512 tmp14116 = _mm512_shuffle_f32x4(tmp14099, tmp14103, 221);
__m512 tmp14117 = _mm512_shuffle_f32x4(tmp14100, tmp14104, 136);
__m512 tmp14118 = _mm512_shuffle_f32x4(tmp14100, tmp14104, 221);
__m512 tmp14119 = _mm512_shuffle_f32x4(tmp14101, tmp14105, 136);
__m512 tmp14120 = _mm512_shuffle_f32x4(tmp14101, tmp14105, 221);
__m512 tmp14121 = _mm512_shuffle_f32x4(tmp14102, tmp14106, 136);
__m512 tmp14122 = _mm512_shuffle_f32x4(tmp14102, tmp14106, 221);
__m512 tmp14123 = _mm512_shuffle_f32x4(tmp14107, tmp14111, 136);
__m512 tmp14124 = _mm512_shuffle_f32x4(tmp14107, tmp14111, 221);
__m512 tmp14125 = _mm512_shuffle_f32x4(tmp14108, tmp14112, 136);
__m512 tmp14126 = _mm512_shuffle_f32x4(tmp14108, tmp14112, 221);
__m512 tmp14127 = _mm512_shuffle_f32x4(tmp14109, tmp14113, 136);
__m512 tmp14128 = _mm512_shuffle_f32x4(tmp14109, tmp14113, 221);
__m512 tmp14129 = _mm512_shuffle_f32x4(tmp14110, tmp14114, 136);
__m512 tmp14130 = _mm512_shuffle_f32x4(tmp14110, tmp14114, 221);
wt601 = _mm512_shuffle_f32x4(tmp14115, tmp14123, 136);
wt609 = _mm512_shuffle_f32x4(tmp14115, tmp14123, 221);
wt602 = _mm512_shuffle_f32x4(tmp14117, tmp14125, 136);
wt610 = _mm512_shuffle_f32x4(tmp14117, tmp14125, 221);
wt603 = _mm512_shuffle_f32x4(tmp14119, tmp14127, 136);
wt611 = _mm512_shuffle_f32x4(tmp14119, tmp14127, 221);
wt604 = _mm512_shuffle_f32x4(tmp14121, tmp14129, 136);
wt612 = _mm512_shuffle_f32x4(tmp14121, tmp14129, 221);
wt605 = _mm512_shuffle_f32x4(tmp14116, tmp14124, 136);
wt613 = _mm512_shuffle_f32x4(tmp14116, tmp14124, 221);
wt606 = _mm512_shuffle_f32x4(tmp14118, tmp14126, 136);
wt614 = _mm512_shuffle_f32x4(tmp14118, tmp14126, 221);
wt607 = _mm512_shuffle_f32x4(tmp14120, tmp14128, 136);
wt615 = _mm512_shuffle_f32x4(tmp14120, tmp14128, 221);
wt608 = _mm512_shuffle_f32x4(tmp14122, tmp14130, 136);
wt616 = _mm512_shuffle_f32x4(tmp14122, tmp14130, 221);
wt601 = _mm512_mul_ps(wt601, postMul53);
wt602 = _mm512_mul_ps(wt602, postMul53);
wt603 = _mm512_mul_ps(wt603, postMul53);
wt604 = _mm512_mul_ps(wt604, postMul53);
wt605 = _mm512_mul_ps(wt605, postMul53);
wt606 = _mm512_mul_ps(wt606, postMul53);
wt607 = _mm512_mul_ps(wt607, postMul53);
wt608 = _mm512_mul_ps(wt608, postMul53);
wt609 = _mm512_mul_ps(wt609, postMul53);
wt610 = _mm512_mul_ps(wt610, postMul53);
wt611 = _mm512_mul_ps(wt611, postMul53);
wt612 = _mm512_mul_ps(wt612, postMul53);
wt613 = _mm512_mul_ps(wt613, postMul53);
wt614 = _mm512_mul_ps(wt614, postMul53);
wt615 = _mm512_mul_ps(wt615, postMul53);
wt616 = _mm512_mul_ps(wt616, postMul53);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(1+16*c51)+(ptrdiff_t)0, 63>>cut22, wt601);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(2+16*c51)+(ptrdiff_t)0, 63>>cut22, wt602);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(3+16*c51)+(ptrdiff_t)0, 63>>cut22, wt603);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(4+16*c51)+(ptrdiff_t)0, 63>>cut22, wt604);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(5+16*c51)+(ptrdiff_t)0, 63>>cut22, wt605);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(6+16*c51)+(ptrdiff_t)0, 63>>cut22, wt606);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(7+16*c51)+(ptrdiff_t)0, 63>>cut22, wt607);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(8+16*c51)+(ptrdiff_t)0, 63>>cut22, wt608);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(9+16*c51)+(ptrdiff_t)0, 63>>cut22, wt609);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(10+16*c51)+(ptrdiff_t)0, 63>>cut22, wt610);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(11+16*c51)+(ptrdiff_t)0, 63>>cut22, wt611);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(12+16*c51)+(ptrdiff_t)0, 63>>cut22, wt612);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(13+16*c51)+(ptrdiff_t)0, 63>>cut22, wt613);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(14+16*c51)+(ptrdiff_t)0, 63>>cut22, wt614);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(15+16*c51)+(ptrdiff_t)0, 63>>cut22, wt615);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(16+16*c51)+(ptrdiff_t)0, 63>>cut22, wt616);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(1+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt601);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(2+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt602);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(3+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt603);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(4+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt604);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(5+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt605);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(6+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt606);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(7+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt607);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(8+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt608);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(9+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt609);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(10+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt610);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(11+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt611);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(12+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt612);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(13+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt613);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(14+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt614);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(15+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt615);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(16+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt616);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(1+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt601);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(2+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt602);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(3+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt603);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(4+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt604);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(5+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt605);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(6+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt606);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(7+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt607);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(8+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt608);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(9+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt609);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(10+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt610);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(11+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt611);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(12+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt612);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(13+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt613);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(14+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt614);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(15+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt615);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(16+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt616);
}
}
}
}
}

static void ResNeXt50OneArrangeWts10(ResNeXt50ThreaderTeam1* team66, char** tensors105) {
    /*
     * Dispatches the weight-arrangement callee through the threader.
     *
     * team66     - threader team handed unchanged to ResNeXt50ThreaderDo1.
     * tensors105 - pointer table stored in the task's `any1` slot for the callee.
     *
     * The task describes a 3-dimensional hull of extents {64, 1, 1}.
     * Presumably the threader invokes the callee once per point of that hull;
     * confirm against ResNeXt50ThreaderDo1.
     */
    enum { kDims = 3 };
    const int64_t extent[kDims] = {64, 1, 1};

    ResNeXt50ThreaderTask1 job;
    job.nd1 = kDims;
    for (int d = 0; d < kDims; ++d) {
        job.hull1[d] = extent[d];
    }
    job.any1 = tensors105;
    job.callee1 = ResNeXt50OneArrangeWts10Callee1;

    ResNeXt50ThreaderDo1(team66, &job);
}

static void ResNeXt50OneArrangeDats10Callee1(ResNeXt50ThreaderTask1* task110, int64_t* pt60) {
    /*
     * Copies one (row slab, column block) cell of the source buffer into the
     * arranged buffer.
     *
     * task110->any1 is read as a char* table: entry 0 is the source, entry 1
     * is the arranged destination.
     * pt60[0] selects a slab of 128 consecutive rows: 128*pt60[0] up to
     *         128*pt60[0]+127. Source rows are 832 bytes apart (presumably
     *         one row per input channel; confirm against the caller).
     * pt60[1] selects the column block within each row.
     *
     * Column block != 3: four full 16-float vectors (256 bytes) are copied
     *   from byte offset 256*block of each source row to
     *   dst + 131072*block + 256*row.
     * Column block == 3: only the low four float lanes are copied, from byte
     *   offset 768 of each source row to dst + 393216 + 64*row. The other
     *   twelve lanes of that 64-byte destination slot are left unwritten.
     */
    char** bufs = task110->any1;
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    const ptrdiff_t block = pt60[1];
    const ptrdiff_t rowBegin = 128*(ptrdiff_t)pt60[0];
    const ptrdiff_t rowEnd = rowBegin+128;

    const __mmask16 allLanes = 0xFFFF;
    const __mmask16 tailLanes = 0x000F;

    if (block != 3) {
        /* Full-width column block: four vectors per row. */
        for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
            const char* in = src+256*block+832*row;
            char* out = dst+131072*block+256*row;
            __m512 quad[4];
            /* Finish every load of the row before the first store, as the original did. */
            for (int v = 0; v < 4; ++v) {
                quad[v] = _mm512_maskz_loadu_ps(allLanes, in+64*v);
            }
            for (int v = 0; v < 4; ++v) {
                _mm512_mask_storeu_ps(out+64*v, allLanes, quad[v]);
            }
        }
    } else {
        /* Trailing column block: a single partially populated vector per row. */
        for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
            const char* in = src+256*block+832*row;
            char* out = dst+131072*block+64*row;
            __m512 tail = _mm512_maskz_loadu_ps(tailLanes, in);
            _mm512_mask_storeu_ps(out, tailLanes, tail);
        }
    }
}

static void ResNeXt50OneArrangeDats10(ResNeXt50ThreaderTeam1* team67, char** tensors107) {
    /*
     * Dispatches the data-arrangement callee through the threader.
     *
     * team67     - threader team handed unchanged to ResNeXt50ThreaderDo1.
     * tensors107 - pointer table stored in the task's `any1` slot; the callee
     *              reads entry 0 as the source and entry 1 as the destination.
     *
     * The task describes a 4-dimensional hull of extents {4, 4, 1, 1}. The
     * callee reads only the first two coordinates: a 128-row slab index and
     * a column-block index.
     */
    enum { kDims = 4 };
    const int64_t extent[kDims] = {4, 4, 1, 1};

    ResNeXt50ThreaderTask1 job;
    job.nd1 = kDims;
    for (int d = 0; d < kDims; ++d) {
        job.hull1[d] = extent[d];
    }
    job.any1 = tensors107;
    job.callee1 = ResNeXt50OneArrangeDats10Callee1;

    ResNeXt50ThreaderDo1(team67, &job);
}

static void ResNeXt50OneApply10Callee1(ResNeXt50ThreaderTask1* task112, int64_t* pt61) {
void** pair26 = task112->any1;
char** tensors110 = pair26[0];
ptrdiff_t e32 = 0;
ptrdiff_t g35 = 0;
ptrdiff_t d22 = pt61[1];
ptrdiff_t w68 = pt61[0];
char*restrict arrangedWts10 = tensors110[0]+3424256*e32+(ptrdiff_t)2101248*1*g35;
char*restrict arrangedDats10 = tensors110[1]+694720*e32+(ptrdiff_t)425984*1*g35;
char*restrict datPtr34 = tensors110[2]+(ptrdiff_t)851968*1*g35;
char*restrict datPtr35 = tensors110[3]+(ptrdiff_t)851968*1*g35;
ptrdiff_t ii48 = 1;
for (ptrdiff_t i66 = 0; i66 < ii48; ++i66) {
ptrdiff_t j58 = 1*d22;
ptrdiff_t jj56 = j58+0;
for (; j58 != 3; ++j58) {
ptrdiff_t k164 = 1*w68;
ptrdiff_t kk56 = k164+0;
for (; k164 != 170; ++k164) {
ptrdiff_t s63 = -1;
__m512 sum499 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)24));
__m512 sum503 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)28));
__m512 sum507 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)32));
__m512 sum511 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)36));
__m512 sum515 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)40));
__m512 sum519 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)44));
__m512 sum500 = sum499;
__m512 sum501 = sum499;
__m512 sum502 = sum499;
__m512 sum504 = sum503;
__m512 sum505 = sum503;
__m512 sum506 = sum503;
__m512 sum508 = sum507;
__m512 sum509 = sum507;
__m512 sum510 = sum507;
__m512 sum512 = sum511;
__m512 sum513 = sum511;
__m512 sum514 = sum511;
__m512 sum516 = sum515;
__m512 sum517 = sum515;
__m512 sum518 = sum515;
__m512 sum520 = sum519;
__m512 sum521 = sum519;
__m512 sum522 = sum519;
for (s63 = 0; s63 < 512; ++s63) {
__m512 dat2457 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)0);
__m512 dat2458 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)64);
__m512 dat2459 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)128);
__m512 dat2460 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)192);
__m512 wt649 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)24));
sum499 = _mm512_fmadd_ps(wt649, dat2457, sum499);
sum500 = _mm512_fmadd_ps(wt649, dat2458, sum500);
sum501 = _mm512_fmadd_ps(wt649, dat2459, sum501);
sum502 = _mm512_fmadd_ps(wt649, dat2460, sum502);
__m512 wt650 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)28));
sum503 = _mm512_fmadd_ps(wt650, dat2457, sum503);
sum504 = _mm512_fmadd_ps(wt650, dat2458, sum504);
sum505 = _mm512_fmadd_ps(wt650, dat2459, sum505);
sum506 = _mm512_fmadd_ps(wt650, dat2460, sum506);
__m512 wt651 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)32));
sum507 = _mm512_fmadd_ps(wt651, dat2457, sum507);
sum508 = _mm512_fmadd_ps(wt651, dat2458, sum508);
sum509 = _mm512_fmadd_ps(wt651, dat2459, sum509);
sum510 = _mm512_fmadd_ps(wt651, dat2460, sum510);
__m512 wt652 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)36));
sum511 = _mm512_fmadd_ps(wt652, dat2457, sum511);
sum512 = _mm512_fmadd_ps(wt652, dat2458, sum512);
sum513 = _mm512_fmadd_ps(wt652, dat2459, sum513);
sum514 = _mm512_fmadd_ps(wt652, dat2460, sum514);
__m512 wt653 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)40));
sum515 = _mm512_fmadd_ps(wt653, dat2457, sum515);
sum516 = _mm512_fmadd_ps(wt653, dat2458, sum516);
sum517 = _mm512_fmadd_ps(wt653, dat2459, sum517);
sum518 = _mm512_fmadd_ps(wt653, dat2460, sum518);
__m512 wt654 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)44));
sum519 = _mm512_fmadd_ps(wt654, dat2457, sum519);
sum520 = _mm512_fmadd_ps(wt654, dat2458, sum520);
sum521 = _mm512_fmadd_ps(wt654, dat2459, sum521);
sum522 = _mm512_fmadd_ps(wt654, dat2460, sum522);
}
sum499 = _mm512_add_ps(sum499, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)0));
sum500 = _mm512_add_ps(sum500, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)64));
sum501 = _mm512_add_ps(sum501, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)128));
sum502 = _mm512_add_ps(sum502, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)192));
sum499 = _mm512_max_ps(_mm512_setzero_ps(), sum499);
sum500 = _mm512_max_ps(_mm512_setzero_ps(), sum500);
sum501 = _mm512_max_ps(_mm512_setzero_ps(), sum501);
sum502 = _mm512_max_ps(_mm512_setzero_ps(), sum502);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)0, 65535, sum499);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)64, 65535, sum500);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)128, 65535, sum501);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)192, 65535, sum502);
sum503 = _mm512_add_ps(sum503, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)832));
sum504 = _mm512_add_ps(sum504, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)896));
sum505 = _mm512_add_ps(sum505, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)960));
sum506 = _mm512_add_ps(sum506, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024));
sum503 = _mm512_max_ps(_mm512_setzero_ps(), sum503);
sum504 = _mm512_max_ps(_mm512_setzero_ps(), sum504);
sum505 = _mm512_max_ps(_mm512_setzero_ps(), sum505);
sum506 = _mm512_max_ps(_mm512_setzero_ps(), sum506);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)832, 65535, sum503);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)896, 65535, sum504);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)960, 65535, sum505);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024, 65535, sum506);
sum507 = _mm512_add_ps(sum507, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664));
sum508 = _mm512_add_ps(sum508, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728));
sum509 = _mm512_add_ps(sum509, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792));
sum510 = _mm512_add_ps(sum510, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856));
sum507 = _mm512_max_ps(_mm512_setzero_ps(), sum507);
sum508 = _mm512_max_ps(_mm512_setzero_ps(), sum508);
sum509 = _mm512_max_ps(_mm512_setzero_ps(), sum509);
sum510 = _mm512_max_ps(_mm512_setzero_ps(), sum510);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664, 65535, sum507);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728, 65535, sum508);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792, 65535, sum509);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856, 65535, sum510);
sum511 = _mm512_add_ps(sum511, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496));
sum512 = _mm512_add_ps(sum512, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560));
sum513 = _mm512_add_ps(sum513, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624));
sum514 = _mm512_add_ps(sum514, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688));
sum511 = _mm512_max_ps(_mm512_setzero_ps(), sum511);
sum512 = _mm512_max_ps(_mm512_setzero_ps(), sum512);
sum513 = _mm512_max_ps(_mm512_setzero_ps(), sum513);
sum514 = _mm512_max_ps(_mm512_setzero_ps(), sum514);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496, 65535, sum511);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560, 65535, sum512);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624, 65535, sum513);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688, 65535, sum514);
sum515 = _mm512_add_ps(sum515, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3328));
sum516 = _mm512_add_ps(sum516, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3392));
sum517 = _mm512_add_ps(sum517, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3456));
sum518 = _mm512_add_ps(sum518, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3520));
sum515 = _mm512_max_ps(_mm512_setzero_ps(), sum515);
sum516 = _mm512_max_ps(_mm512_setzero_ps(), sum516);
sum517 = _mm512_max_ps(_mm512_setzero_ps(), sum517);
sum518 = _mm512_max_ps(_mm512_setzero_ps(), sum518);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3328, 65535, sum515);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3392, 65535, sum516);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3456, 65535, sum517);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3520, 65535, sum518);
sum519 = _mm512_add_ps(sum519, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4160));
sum520 = _mm512_add_ps(sum520, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4224));
sum521 = _mm512_add_ps(sum521, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4288));
sum522 = _mm512_add_ps(sum522, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4352));
sum519 = _mm512_max_ps(_mm512_setzero_ps(), sum519);
sum520 = _mm512_max_ps(_mm512_setzero_ps(), sum520);
sum521 = _mm512_max_ps(_mm512_setzero_ps(), sum521);
sum522 = _mm512_max_ps(_mm512_setzero_ps(), sum522);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4160, 65535, sum519);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4224, 65535, sum520);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4288, 65535, sum521);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4352, 65535, sum522);
if (k164 >= kk56) return;
}
ptrdiff_t s64 = -1;
__m512 sum523 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)16));
__m512 sum527 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)20));
__m512 sum531 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)24));
__m512 sum535 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)28));
__m512 sum524 = sum523;
__m512 sum525 = sum523;
__m512 sum526 = sum523;
__m512 sum528 = sum527;
__m512 sum529 = sum527;
__m512 sum530 = sum527;
__m512 sum532 = sum531;
__m512 sum533 = sum531;
__m512 sum534 = sum531;
__m512 sum536 = sum535;
__m512 sum537 = sum535;
__m512 sum538 = sum535;
for (s64 = 0; s64 < 512; ++s64) {
__m512 dat2461 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)0);
__m512 dat2462 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)64);
__m512 dat2463 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)128);
__m512 dat2464 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)192);
__m512 wt655 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)16));
sum523 = _mm512_fmadd_ps(wt655, dat2461, sum523);
sum524 = _mm512_fmadd_ps(wt655, dat2462, sum524);
sum525 = _mm512_fmadd_ps(wt655, dat2463, sum525);
sum526 = _mm512_fmadd_ps(wt655, dat2464, sum526);
__m512 wt656 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)20));
sum527 = _mm512_fmadd_ps(wt656, dat2461, sum527);
sum528 = _mm512_fmadd_ps(wt656, dat2462, sum528);
sum529 = _mm512_fmadd_ps(wt656, dat2463, sum529);
sum530 = _mm512_fmadd_ps(wt656, dat2464, sum530);
__m512 wt657 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)24));
sum531 = _mm512_fmadd_ps(wt657, dat2461, sum531);
sum532 = _mm512_fmadd_ps(wt657, dat2462, sum532);
sum533 = _mm512_fmadd_ps(wt657, dat2463, sum533);
sum534 = _mm512_fmadd_ps(wt657, dat2464, sum534);
__m512 wt658 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)28));
sum535 = _mm512_fmadd_ps(wt658, dat2461, sum535);
sum536 = _mm512_fmadd_ps(wt658, dat2462, sum536);
sum537 = _mm512_fmadd_ps(wt658, dat2463, sum537);
sum538 = _mm512_fmadd_ps(wt658, dat2464, sum538);
}
sum523 = _mm512_add_ps(sum523, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)0));
sum524 = _mm512_add_ps(sum524, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)64));
sum525 = _mm512_add_ps(sum525, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)128));
sum526 = _mm512_add_ps(sum526, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)192));
sum523 = _mm512_max_ps(_mm512_setzero_ps(), sum523);
sum524 = _mm512_max_ps(_mm512_setzero_ps(), sum524);
sum525 = _mm512_max_ps(_mm512_setzero_ps(), sum525);
sum526 = _mm512_max_ps(_mm512_setzero_ps(), sum526);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)0, 65535, sum523);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)64, 65535, sum524);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)128, 65535, sum525);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)192, 65535, sum526);
sum527 = _mm512_add_ps(sum527, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)832));
sum528 = _mm512_add_ps(sum528, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)896));
sum529 = _mm512_add_ps(sum529, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)960));
sum530 = _mm512_add_ps(sum530, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024));
sum527 = _mm512_max_ps(_mm512_setzero_ps(), sum527);
sum528 = _mm512_max_ps(_mm512_setzero_ps(), sum528);
sum529 = _mm512_max_ps(_mm512_setzero_ps(), sum529);
sum530 = _mm512_max_ps(_mm512_setzero_ps(), sum530);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)832, 65535, sum527);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)896, 65535, sum528);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)960, 65535, sum529);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024, 65535, sum530);
sum531 = _mm512_add_ps(sum531, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664));
sum532 = _mm512_add_ps(sum532, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728));
sum533 = _mm512_add_ps(sum533, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792));
sum534 = _mm512_add_ps(sum534, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856));
sum531 = _mm512_max_ps(_mm512_setzero_ps(), sum531);
sum532 = _mm512_max_ps(_mm512_setzero_ps(), sum532);
sum533 = _mm512_max_ps(_mm512_setzero_ps(), sum533);
sum534 = _mm512_max_ps(_mm512_setzero_ps(), sum534);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664, 65535, sum531);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728, 65535, sum532);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792, 65535, sum533);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856, 65535, sum534);
sum535 = _mm512_add_ps(sum535, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496));
sum536 = _mm512_add_ps(sum536, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560));
sum537 = _mm512_add_ps(sum537, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624));
sum538 = _mm512_add_ps(sum538, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688));
sum535 = _mm512_max_ps(_mm512_setzero_ps(), sum535);
sum536 = _mm512_max_ps(_mm512_setzero_ps(), sum536);
sum537 = _mm512_max_ps(_mm512_setzero_ps(), sum537);
sum538 = _mm512_max_ps(_mm512_setzero_ps(), sum538);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496, 65535, sum535);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560, 65535, sum536);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624, 65535, sum537);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688, 65535, sum538);
if (j58 >= jj56) return;
}
ptrdiff_t k165 = 1*w68;
ptrdiff_t kk57 = k165+0;
for (; k165 != 170; ++k165) {
ptrdiff_t s65 = -1;
__m512 sum539 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)24));
__m512 sum540 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)28));
__m512 sum541 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)32));
__m512 sum542 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)36));
__m512 sum543 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)40));
__m512 sum544 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)44));
for (s65 = 0; s65 < 512; ++s65) {
__m512 dat2465 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+64*s65+(ptrdiff_t)0);
__m512 wt659 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)24));
sum539 = _mm512_fmadd_ps(wt659, dat2465, sum539);
__m512 wt660 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)28));
sum540 = _mm512_fmadd_ps(wt660, dat2465, sum540);
__m512 wt661 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)32));
sum541 = _mm512_fmadd_ps(wt661, dat2465, sum541);
__m512 wt662 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)36));
sum542 = _mm512_fmadd_ps(wt662, dat2465, sum542);
__m512 wt663 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)40));
sum543 = _mm512_fmadd_ps(wt663, dat2465, sum543);
__m512 wt664 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)44));
sum544 = _mm512_fmadd_ps(wt664, dat2465, sum544);
}
sum539 = _mm512_add_ps(sum539, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)0));
sum539 = _mm512_max_ps(_mm512_setzero_ps(), sum539);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)0, 15, sum539);
sum540 = _mm512_add_ps(sum540, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)832));
sum540 = _mm512_max_ps(_mm512_setzero_ps(), sum540);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)832, 15, sum540);
sum541 = _mm512_add_ps(sum541, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664));
sum541 = _mm512_max_ps(_mm512_setzero_ps(), sum541);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664, 15, sum541);
sum542 = _mm512_add_ps(sum542, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496));
sum542 = _mm512_max_ps(_mm512_setzero_ps(), sum542);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496, 15, sum542);
sum543 = _mm512_add_ps(sum543, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)3328));
sum543 = _mm512_max_ps(_mm512_setzero_ps(), sum543);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)3328, 15, sum543);
sum544 = _mm512_add_ps(sum544, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)4160));
sum544 = _mm512_max_ps(_mm512_setzero_ps(), sum544);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)4160, 15, sum544);
if (k165 >= kk57) return;
}
ptrdiff_t s66 = -1;
__m512 sum545 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)16));
__m512 sum546 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)20));
__m512 sum547 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)24));
__m512 sum548 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)28));
for (s66 = 0; s66 < 512; ++s66) {
__m512 dat2466 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+64*s66+(ptrdiff_t)0);
__m512 wt665 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)16));
sum545 = _mm512_fmadd_ps(wt665, dat2466, sum545);
__m512 wt666 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)20));
sum546 = _mm512_fmadd_ps(wt666, dat2466, sum546);
__m512 wt667 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)24));
sum547 = _mm512_fmadd_ps(wt667, dat2466, sum547);
__m512 wt668 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)28));
sum548 = _mm512_fmadd_ps(wt668, dat2466, sum548);
}
sum545 = _mm512_add_ps(sum545, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)0));
sum545 = _mm512_max_ps(_mm512_setzero_ps(), sum545);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)0, 15, sum545);
sum546 = _mm512_add_ps(sum546, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)832));
sum546 = _mm512_max_ps(_mm512_setzero_ps(), sum546);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)832, 15, sum546);
sum547 = _mm512_add_ps(sum547, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664));
sum547 = _mm512_max_ps(_mm512_setzero_ps(), sum547);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664, 15, sum547);
sum548 = _mm512_add_ps(sum548, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496));
sum548 = _mm512_max_ps(_mm512_setzero_ps(), sum548);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496, 15, sum548);
}
}

/*
 * Builds a threader task that runs ResNeXt50OneApply10Callee1 and submits
 * it to the given team through ResNeXt50ThreaderDo1.
 *
 * team68     - threader team forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors109 - tensor pointer array; stored in slot 0 of a two-element
 *              context array whose slot 1 is a null pointer.
 *
 * The task is configured with nd1 = 3 and hull1 = {171, 4, 1}.
 * NOTE(review): the context array and the task both live on this stack
 * frame, so ResNeXt50ThreaderDo1 presumably finishes using them before it
 * returns -- confirm against the threader implementation.
 */
static void ResNeXt50OneApply10(ResNeXt50ThreaderTeam1* team68, char** tensors109) {
	void* ctx[2];
	ResNeXt50ThreaderTask1 job;

	/* Context handed to the callee via the task's opaque pointer. */
	ctx[0] = tensors109;
	ctx[1] = 0;

	/* Three task dimensions with hull values 171, 4 and 1. */
	job.nd1 = 3;
	job.hull1[0] = 171;
	job.hull1[1] = 4;
	job.hull1[2] = 1;

	job.any1 = ctx;
	job.callee1 = ResNeXt50OneApply10Callee1;

	ResNeXt50ThreaderDo1(team68, &job);
}

static void ResNeXt50OneArrangeWts11Callee1(ResNeXt50ThreaderTask1* task114, int64_t* pt62) {
char** tensors112 = task114->any1;
ptrdiff_t b82 = pt62[0];
char*restrict wtPtr18 = tensors112[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2097152*0;
char*restrict biasPtr18 = tensors112[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr19 = tensors112[2]+(ptrdiff_t)8*512*0;
char*restrict arranged21 = tensors112[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)2099200*0;
ptrdiff_t ii49 = 1;
for (ptrdiff_t i67 = 0; i67 < ii49; ++i67) {
ptrdiff_t j59 = 1*b82;
ptrdiff_t jj57 = j59+1;
for (; j59 < jj57; ++j59) {
if (j59 < 31) {
ptrdiff_t k167 = 0+16*(j59-0);
ptrdiff_t l71 = (size_t)(0+k167)/6;
ptrdiff_t cut25 = (size_t)(0+k167)%6;
switch (cut25) {
case 0:;
case 2: {
__m512 sum550 = _mm512_maskz_loadu_ps(65535, biasPtr18+2048*i67+4*k167);
__m512i pmMul38 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd38 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo32 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k167+512*i67));
__m512 masHi32 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k167+512*i67)+(ptrdiff_t)64);
__m512 postMul57 = _mm512_permutex2var_ps(masLo32, pmMul38, masHi32);
__m512 postAdd39 = _mm512_permutex2var_ps(masLo32, pmAdd38, masHi32);
sum550 = _mm512_fmadd_ps(sum550, postMul57, postAdd39);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)0, 63>>cut25, sum550);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)24576, 4032>>cut25, sum550);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)49152, 65535-(4095>>cut25), sum550);
ptrdiff_t c56 = 0;
for (; c56 != 64; ++c56) {
__m512 wt685 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)0);
__m512 wt686 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)4096);
__m512 wt687 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)8192);
__m512 wt688 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)12288);
__m512 wt689 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)16384);
__m512 wt690 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)20480);
__m512 wt691 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)24576);
__m512 wt692 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)28672);
__m512 wt693 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)32768);
__m512 wt694 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)36864);
__m512 wt695 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)40960);
__m512 wt696 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)45056);
__m512 wt697 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)49152);
__m512 wt698 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)53248);
__m512 wt699 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)57344);
__m512 wt700 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)61440);
__m512 tmp14131 = _mm512_unpacklo_ps(wt685, wt686);
__m512 tmp14132 = _mm512_unpackhi_ps(wt685, wt686);
__m512 tmp14133 = _mm512_unpacklo_ps(wt687, wt688);
__m512 tmp14134 = _mm512_unpackhi_ps(wt687, wt688);
__m512 tmp14135 = _mm512_unpacklo_ps(wt689, wt690);
__m512 tmp14136 = _mm512_unpackhi_ps(wt689, wt690);
__m512 tmp14137 = _mm512_unpacklo_ps(wt691, wt692);
__m512 tmp14138 = _mm512_unpackhi_ps(wt691, wt692);
__m512 tmp14139 = _mm512_unpacklo_ps(wt693, wt694);
__m512 tmp14140 = _mm512_unpackhi_ps(wt693, wt694);
__m512 tmp14141 = _mm512_unpacklo_ps(wt695, wt696);
__m512 tmp14142 = _mm512_unpackhi_ps(wt695, wt696);
__m512 tmp14143 = _mm512_unpacklo_ps(wt697, wt698);
__m512 tmp14144 = _mm512_unpackhi_ps(wt697, wt698);
__m512 tmp14145 = _mm512_unpacklo_ps(wt699, wt700);
__m512 tmp14146 = _mm512_unpackhi_ps(wt699, wt700);
__m512 tmp14147 = _mm512_shuffle_ps(tmp14131, tmp14133, 68);
__m512 tmp14148 = _mm512_shuffle_ps(tmp14131, tmp14133, 238);
__m512 tmp14149 = _mm512_shuffle_ps(tmp14132, tmp14134, 68);
__m512 tmp14150 = _mm512_shuffle_ps(tmp14132, tmp14134, 238);
__m512 tmp14151 = _mm512_shuffle_ps(tmp14135, tmp14137, 68);
__m512 tmp14152 = _mm512_shuffle_ps(tmp14135, tmp14137, 238);
__m512 tmp14153 = _mm512_shuffle_ps(tmp14136, tmp14138, 68);
__m512 tmp14154 = _mm512_shuffle_ps(tmp14136, tmp14138, 238);
__m512 tmp14155 = _mm512_shuffle_ps(tmp14139, tmp14141, 68);
__m512 tmp14156 = _mm512_shuffle_ps(tmp14139, tmp14141, 238);
__m512 tmp14157 = _mm512_shuffle_ps(tmp14140, tmp14142, 68);
__m512 tmp14158 = _mm512_shuffle_ps(tmp14140, tmp14142, 238);
__m512 tmp14159 = _mm512_shuffle_ps(tmp14143, tmp14145, 68);
__m512 tmp14160 = _mm512_shuffle_ps(tmp14143, tmp14145, 238);
__m512 tmp14161 = _mm512_shuffle_ps(tmp14144, tmp14146, 68);
__m512 tmp14162 = _mm512_shuffle_ps(tmp14144, tmp14146, 238);
__m512 tmp14163 = _mm512_shuffle_f32x4(tmp14147, tmp14151, 136);
__m512 tmp14164 = _mm512_shuffle_f32x4(tmp14147, tmp14151, 221);
__m512 tmp14165 = _mm512_shuffle_f32x4(tmp14148, tmp14152, 136);
__m512 tmp14166 = _mm512_shuffle_f32x4(tmp14148, tmp14152, 221);
__m512 tmp14167 = _mm512_shuffle_f32x4(tmp14149, tmp14153, 136);
__m512 tmp14168 = _mm512_shuffle_f32x4(tmp14149, tmp14153, 221);
__m512 tmp14169 = _mm512_shuffle_f32x4(tmp14150, tmp14154, 136);
__m512 tmp14170 = _mm512_shuffle_f32x4(tmp14150, tmp14154, 221);
__m512 tmp14171 = _mm512_shuffle_f32x4(tmp14155, tmp14159, 136);
__m512 tmp14172 = _mm512_shuffle_f32x4(tmp14155, tmp14159, 221);
__m512 tmp14173 = _mm512_shuffle_f32x4(tmp14156, tmp14160, 136);
__m512 tmp14174 = _mm512_shuffle_f32x4(tmp14156, tmp14160, 221);
__m512 tmp14175 = _mm512_shuffle_f32x4(tmp14157, tmp14161, 136);
__m512 tmp14176 = _mm512_shuffle_f32x4(tmp14157, tmp14161, 221);
__m512 tmp14177 = _mm512_shuffle_f32x4(tmp14158, tmp14162, 136);
__m512 tmp14178 = _mm512_shuffle_f32x4(tmp14158, tmp14162, 221);
wt685 = _mm512_shuffle_f32x4(tmp14163, tmp14171, 136);
wt693 = _mm512_shuffle_f32x4(tmp14163, tmp14171, 221);
wt686 = _mm512_shuffle_f32x4(tmp14165, tmp14173, 136);
wt694 = _mm512_shuffle_f32x4(tmp14165, tmp14173, 221);
wt687 = _mm512_shuffle_f32x4(tmp14167, tmp14175, 136);
wt695 = _mm512_shuffle_f32x4(tmp14167, tmp14175, 221);
wt688 = _mm512_shuffle_f32x4(tmp14169, tmp14177, 136);
wt696 = _mm512_shuffle_f32x4(tmp14169, tmp14177, 221);
wt689 = _mm512_shuffle_f32x4(tmp14164, tmp14172, 136);
wt697 = _mm512_shuffle_f32x4(tmp14164, tmp14172, 221);
wt690 = _mm512_shuffle_f32x4(tmp14166, tmp14174, 136);
wt698 = _mm512_shuffle_f32x4(tmp14166, tmp14174, 221);
wt691 = _mm512_shuffle_f32x4(tmp14168, tmp14176, 136);
wt699 = _mm512_shuffle_f32x4(tmp14168, tmp14176, 221);
wt692 = _mm512_shuffle_f32x4(tmp14170, tmp14178, 136);
wt700 = _mm512_shuffle_f32x4(tmp14170, tmp14178, 221);
wt685 = _mm512_mul_ps(wt685, postMul57);
wt686 = _mm512_mul_ps(wt686, postMul57);
wt687 = _mm512_mul_ps(wt687, postMul57);
wt688 = _mm512_mul_ps(wt688, postMul57);
wt689 = _mm512_mul_ps(wt689, postMul57);
wt690 = _mm512_mul_ps(wt690, postMul57);
wt691 = _mm512_mul_ps(wt691, postMul57);
wt692 = _mm512_mul_ps(wt692, postMul57);
wt693 = _mm512_mul_ps(wt693, postMul57);
wt694 = _mm512_mul_ps(wt694, postMul57);
wt695 = _mm512_mul_ps(wt695, postMul57);
wt696 = _mm512_mul_ps(wt696, postMul57);
wt697 = _mm512_mul_ps(wt697, postMul57);
wt698 = _mm512_mul_ps(wt698, postMul57);
wt699 = _mm512_mul_ps(wt699, postMul57);
wt700 = _mm512_mul_ps(wt700, postMul57);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c56)+(ptrdiff_t)0, 63>>cut25, wt685);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c56)+(ptrdiff_t)0, 63>>cut25, wt686);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c56)+(ptrdiff_t)0, 63>>cut25, wt687);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c56)+(ptrdiff_t)0, 63>>cut25, wt688);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c56)+(ptrdiff_t)0, 63>>cut25, wt689);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c56)+(ptrdiff_t)0, 63>>cut25, wt690);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c56)+(ptrdiff_t)0, 63>>cut25, wt691);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c56)+(ptrdiff_t)0, 63>>cut25, wt692);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c56)+(ptrdiff_t)0, 63>>cut25, wt693);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c56)+(ptrdiff_t)0, 63>>cut25, wt694);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c56)+(ptrdiff_t)0, 63>>cut25, wt695);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c56)+(ptrdiff_t)0, 63>>cut25, wt696);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c56)+(ptrdiff_t)0, 63>>cut25, wt697);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c56)+(ptrdiff_t)0, 63>>cut25, wt698);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c56)+(ptrdiff_t)0, 63>>cut25, wt699);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c56)+(ptrdiff_t)0, 63>>cut25, wt700);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt685);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt686);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt687);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt688);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt689);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt690);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt691);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt692);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt693);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt694);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt695);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt696);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt697);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt698);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt699);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt700);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt685);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt686);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt687);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt688);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt689);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt690);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt691);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt692);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt693);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt694);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt695);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt696);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt697);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt698);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt699);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt700);
}
break;
}
default: {
cut25 = 4;
__m512 sum551 = _mm512_maskz_loadu_ps(65535, biasPtr18+2048*i67+4*k167);
__m512i pmMul39 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd39 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo33 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k167+512*i67));
__m512 masHi33 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k167+512*i67)+(ptrdiff_t)64);
__m512 postMul58 = _mm512_permutex2var_ps(masLo33, pmMul39, masHi33);
__m512 postAdd40 = _mm512_permutex2var_ps(masLo33, pmAdd39, masHi33);
sum551 = _mm512_fmadd_ps(sum551, postMul58, postAdd40);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)0, 63>>cut25, sum551);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)24576, 4032>>cut25, sum551);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)49152, 258048>>cut25, sum551);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)73728, 65535-(262143>>cut25), sum551);
ptrdiff_t c57 = 0;
for (; c57 != 64; ++c57) {
__m512 wt701 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)0);
__m512 wt702 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)4096);
__m512 wt703 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)8192);
__m512 wt704 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)12288);
__m512 wt705 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)16384);
__m512 wt706 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)20480);
__m512 wt707 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)24576);
__m512 wt708 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)28672);
__m512 wt709 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)32768);
__m512 wt710 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)36864);
__m512 wt711 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)40960);
__m512 wt712 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)45056);
__m512 wt713 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)49152);
__m512 wt714 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)53248);
__m512 wt715 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)57344);
__m512 wt716 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)61440);
__m512 tmp14179 = _mm512_unpacklo_ps(wt701, wt702);
__m512 tmp14180 = _mm512_unpackhi_ps(wt701, wt702);
__m512 tmp14181 = _mm512_unpacklo_ps(wt703, wt704);
__m512 tmp14182 = _mm512_unpackhi_ps(wt703, wt704);
__m512 tmp14183 = _mm512_unpacklo_ps(wt705, wt706);
__m512 tmp14184 = _mm512_unpackhi_ps(wt705, wt706);
__m512 tmp14185 = _mm512_unpacklo_ps(wt707, wt708);
__m512 tmp14186 = _mm512_unpackhi_ps(wt707, wt708);
__m512 tmp14187 = _mm512_unpacklo_ps(wt709, wt710);
__m512 tmp14188 = _mm512_unpackhi_ps(wt709, wt710);
__m512 tmp14189 = _mm512_unpacklo_ps(wt711, wt712);
__m512 tmp14190 = _mm512_unpackhi_ps(wt711, wt712);
__m512 tmp14191 = _mm512_unpacklo_ps(wt713, wt714);
__m512 tmp14192 = _mm512_unpackhi_ps(wt713, wt714);
__m512 tmp14193 = _mm512_unpacklo_ps(wt715, wt716);
__m512 tmp14194 = _mm512_unpackhi_ps(wt715, wt716);
__m512 tmp14195 = _mm512_shuffle_ps(tmp14179, tmp14181, 68);
__m512 tmp14196 = _mm512_shuffle_ps(tmp14179, tmp14181, 238);
__m512 tmp14197 = _mm512_shuffle_ps(tmp14180, tmp14182, 68);
__m512 tmp14198 = _mm512_shuffle_ps(tmp14180, tmp14182, 238);
__m512 tmp14199 = _mm512_shuffle_ps(tmp14183, tmp14185, 68);
__m512 tmp14200 = _mm512_shuffle_ps(tmp14183, tmp14185, 238);
__m512 tmp14201 = _mm512_shuffle_ps(tmp14184, tmp14186, 68);
__m512 tmp14202 = _mm512_shuffle_ps(tmp14184, tmp14186, 238);
__m512 tmp14203 = _mm512_shuffle_ps(tmp14187, tmp14189, 68);
__m512 tmp14204 = _mm512_shuffle_ps(tmp14187, tmp14189, 238);
__m512 tmp14205 = _mm512_shuffle_ps(tmp14188, tmp14190, 68);
__m512 tmp14206 = _mm512_shuffle_ps(tmp14188, tmp14190, 238);
__m512 tmp14207 = _mm512_shuffle_ps(tmp14191, tmp14193, 68);
__m512 tmp14208 = _mm512_shuffle_ps(tmp14191, tmp14193, 238);
__m512 tmp14209 = _mm512_shuffle_ps(tmp14192, tmp14194, 68);
__m512 tmp14210 = _mm512_shuffle_ps(tmp14192, tmp14194, 238);
__m512 tmp14211 = _mm512_shuffle_f32x4(tmp14195, tmp14199, 136);
__m512 tmp14212 = _mm512_shuffle_f32x4(tmp14195, tmp14199, 221);
__m512 tmp14213 = _mm512_shuffle_f32x4(tmp14196, tmp14200, 136);
__m512 tmp14214 = _mm512_shuffle_f32x4(tmp14196, tmp14200, 221);
__m512 tmp14215 = _mm512_shuffle_f32x4(tmp14197, tmp14201, 136);
__m512 tmp14216 = _mm512_shuffle_f32x4(tmp14197, tmp14201, 221);
__m512 tmp14217 = _mm512_shuffle_f32x4(tmp14198, tmp14202, 136);
__m512 tmp14218 = _mm512_shuffle_f32x4(tmp14198, tmp14202, 221);
__m512 tmp14219 = _mm512_shuffle_f32x4(tmp14203, tmp14207, 136);
__m512 tmp14220 = _mm512_shuffle_f32x4(tmp14203, tmp14207, 221);
__m512 tmp14221 = _mm512_shuffle_f32x4(tmp14204, tmp14208, 136);
__m512 tmp14222 = _mm512_shuffle_f32x4(tmp14204, tmp14208, 221);
__m512 tmp14223 = _mm512_shuffle_f32x4(tmp14205, tmp14209, 136);
__m512 tmp14224 = _mm512_shuffle_f32x4(tmp14205, tmp14209, 221);
__m512 tmp14225 = _mm512_shuffle_f32x4(tmp14206, tmp14210, 136);
__m512 tmp14226 = _mm512_shuffle_f32x4(tmp14206, tmp14210, 221);
wt701 = _mm512_shuffle_f32x4(tmp14211, tmp14219, 136);
wt709 = _mm512_shuffle_f32x4(tmp14211, tmp14219, 221);
wt702 = _mm512_shuffle_f32x4(tmp14213, tmp14221, 136);
wt710 = _mm512_shuffle_f32x4(tmp14213, tmp14221, 221);
wt703 = _mm512_shuffle_f32x4(tmp14215, tmp14223, 136);
wt711 = _mm512_shuffle_f32x4(tmp14215, tmp14223, 221);
wt704 = _mm512_shuffle_f32x4(tmp14217, tmp14225, 136);
wt712 = _mm512_shuffle_f32x4(tmp14217, tmp14225, 221);
wt705 = _mm512_shuffle_f32x4(tmp14212, tmp14220, 136);
wt713 = _mm512_shuffle_f32x4(tmp14212, tmp14220, 221);
wt706 = _mm512_shuffle_f32x4(tmp14214, tmp14222, 136);
wt714 = _mm512_shuffle_f32x4(tmp14214, tmp14222, 221);
wt707 = _mm512_shuffle_f32x4(tmp14216, tmp14224, 136);
wt715 = _mm512_shuffle_f32x4(tmp14216, tmp14224, 221);
wt708 = _mm512_shuffle_f32x4(tmp14218, tmp14226, 136);
wt716 = _mm512_shuffle_f32x4(tmp14218, tmp14226, 221);
wt701 = _mm512_mul_ps(wt701, postMul58);
wt702 = _mm512_mul_ps(wt702, postMul58);
wt703 = _mm512_mul_ps(wt703, postMul58);
wt704 = _mm512_mul_ps(wt704, postMul58);
wt705 = _mm512_mul_ps(wt705, postMul58);
wt706 = _mm512_mul_ps(wt706, postMul58);
wt707 = _mm512_mul_ps(wt707, postMul58);
wt708 = _mm512_mul_ps(wt708, postMul58);
wt709 = _mm512_mul_ps(wt709, postMul58);
wt710 = _mm512_mul_ps(wt710, postMul58);
wt711 = _mm512_mul_ps(wt711, postMul58);
wt712 = _mm512_mul_ps(wt712, postMul58);
wt713 = _mm512_mul_ps(wt713, postMul58);
wt714 = _mm512_mul_ps(wt714, postMul58);
wt715 = _mm512_mul_ps(wt715, postMul58);
wt716 = _mm512_mul_ps(wt716, postMul58);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)0, 63>>cut25, wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)0, 63>>cut25, wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)0, 63>>cut25, wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)0, 63>>cut25, wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)0, 63>>cut25, wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)0, 63>>cut25, wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)0, 63>>cut25, wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)0, 63>>cut25, wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)0, 63>>cut25, wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)0, 63>>cut25, wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)0, 63>>cut25, wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)0, 63>>cut25, wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)0, 63>>cut25, wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)0, 63>>cut25, wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)0, 63>>cut25, wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)0, 63>>cut25, wt716);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt716);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt716);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt716);
}
}
}
} else {
ptrdiff_t k166 = 496;
ptrdiff_t l70 = (size_t)(0+k166)/6;
ptrdiff_t cut24 = (size_t)(0+k166)%6;
__m512 sum549 = _mm512_maskz_loadu_ps(65535, biasPtr18+2048*i67+4*k166);
__m512i pmMul40 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd40 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo34 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k166+512*i67));
__m512 masHi34 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k166+512*i67)+(ptrdiff_t)64);
__m512 postMul56 = _mm512_permutex2var_ps(masLo34, pmMul40, masHi34);
__m512 postAdd38 = _mm512_permutex2var_ps(masLo34, pmAdd40, masHi34);
sum549 = _mm512_fmadd_ps(sum549, postMul56, postAdd38);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*0+(ptrdiff_t)0, 63>>cut24, sum549);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*0+(ptrdiff_t)24576, 4032>>cut24, sum549);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*0+(ptrdiff_t)49152, 258048>>cut24, sum549);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*0+(ptrdiff_t)73728, 65535-(262143>>cut24), sum549);
ptrdiff_t c55 = 0;
for (; c55 != 64; ++c55) {
__m512 wt669 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)0);
__m512 wt670 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)4096);
__m512 wt671 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)8192);
__m512 wt672 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)12288);
__m512 wt673 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)16384);
__m512 wt674 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)20480);
__m512 wt675 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)24576);
__m512 wt676 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)28672);
__m512 wt677 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)32768);
__m512 wt678 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)36864);
__m512 wt679 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)40960);
__m512 wt680 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)45056);
__m512 wt681 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)49152);
__m512 wt682 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)53248);
__m512 wt683 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)57344);
__m512 wt684 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)61440);
__m512 tmp14227 = _mm512_unpacklo_ps(wt669, wt670);
__m512 tmp14228 = _mm512_unpackhi_ps(wt669, wt670);
__m512 tmp14229 = _mm512_unpacklo_ps(wt671, wt672);
__m512 tmp14230 = _mm512_unpackhi_ps(wt671, wt672);
__m512 tmp14231 = _mm512_unpacklo_ps(wt673, wt674);
__m512 tmp14232 = _mm512_unpackhi_ps(wt673, wt674);
__m512 tmp14233 = _mm512_unpacklo_ps(wt675, wt676);
__m512 tmp14234 = _mm512_unpackhi_ps(wt675, wt676);
__m512 tmp14235 = _mm512_unpacklo_ps(wt677, wt678);
__m512 tmp14236 = _mm512_unpackhi_ps(wt677, wt678);
__m512 tmp14237 = _mm512_unpacklo_ps(wt679, wt680);
__m512 tmp14238 = _mm512_unpackhi_ps(wt679, wt680);
__m512 tmp14239 = _mm512_unpacklo_ps(wt681, wt682);
__m512 tmp14240 = _mm512_unpackhi_ps(wt681, wt682);
__m512 tmp14241 = _mm512_unpacklo_ps(wt683, wt684);
__m512 tmp14242 = _mm512_unpackhi_ps(wt683, wt684);
__m512 tmp14243 = _mm512_shuffle_ps(tmp14227, tmp14229, 68);
__m512 tmp14244 = _mm512_shuffle_ps(tmp14227, tmp14229, 238);
__m512 tmp14245 = _mm512_shuffle_ps(tmp14228, tmp14230, 68);
__m512 tmp14246 = _mm512_shuffle_ps(tmp14228, tmp14230, 238);
__m512 tmp14247 = _mm512_shuffle_ps(tmp14231, tmp14233, 68);
__m512 tmp14248 = _mm512_shuffle_ps(tmp14231, tmp14233, 238);
__m512 tmp14249 = _mm512_shuffle_ps(tmp14232, tmp14234, 68);
__m512 tmp14250 = _mm512_shuffle_ps(tmp14232, tmp14234, 238);
__m512 tmp14251 = _mm512_shuffle_ps(tmp14235, tmp14237, 68);
__m512 tmp14252 = _mm512_shuffle_ps(tmp14235, tmp14237, 238);
__m512 tmp14253 = _mm512_shuffle_ps(tmp14236, tmp14238, 68);
__m512 tmp14254 = _mm512_shuffle_ps(tmp14236, tmp14238, 238);
__m512 tmp14255 = _mm512_shuffle_ps(tmp14239, tmp14241, 68);
__m512 tmp14256 = _mm512_shuffle_ps(tmp14239, tmp14241, 238);
__m512 tmp14257 = _mm512_shuffle_ps(tmp14240, tmp14242, 68);
__m512 tmp14258 = _mm512_shuffle_ps(tmp14240, tmp14242, 238);
__m512 tmp14259 = _mm512_shuffle_f32x4(tmp14243, tmp14247, 136);
__m512 tmp14260 = _mm512_shuffle_f32x4(tmp14243, tmp14247, 221);
__m512 tmp14261 = _mm512_shuffle_f32x4(tmp14244, tmp14248, 136);
__m512 tmp14262 = _mm512_shuffle_f32x4(tmp14244, tmp14248, 221);
__m512 tmp14263 = _mm512_shuffle_f32x4(tmp14245, tmp14249, 136);
__m512 tmp14264 = _mm512_shuffle_f32x4(tmp14245, tmp14249, 221);
__m512 tmp14265 = _mm512_shuffle_f32x4(tmp14246, tmp14250, 136);
__m512 tmp14266 = _mm512_shuffle_f32x4(tmp14246, tmp14250, 221);
__m512 tmp14267 = _mm512_shuffle_f32x4(tmp14251, tmp14255, 136);
__m512 tmp14268 = _mm512_shuffle_f32x4(tmp14251, tmp14255, 221);
__m512 tmp14269 = _mm512_shuffle_f32x4(tmp14252, tmp14256, 136);
__m512 tmp14270 = _mm512_shuffle_f32x4(tmp14252, tmp14256, 221);
__m512 tmp14271 = _mm512_shuffle_f32x4(tmp14253, tmp14257, 136);
__m512 tmp14272 = _mm512_shuffle_f32x4(tmp14253, tmp14257, 221);
__m512 tmp14273 = _mm512_shuffle_f32x4(tmp14254, tmp14258, 136);
__m512 tmp14274 = _mm512_shuffle_f32x4(tmp14254, tmp14258, 221);
wt669 = _mm512_shuffle_f32x4(tmp14259, tmp14267, 136);
wt677 = _mm512_shuffle_f32x4(tmp14259, tmp14267, 221);
wt670 = _mm512_shuffle_f32x4(tmp14261, tmp14269, 136);
wt678 = _mm512_shuffle_f32x4(tmp14261, tmp14269, 221);
wt671 = _mm512_shuffle_f32x4(tmp14263, tmp14271, 136);
wt679 = _mm512_shuffle_f32x4(tmp14263, tmp14271, 221);
wt672 = _mm512_shuffle_f32x4(tmp14265, tmp14273, 136);
wt680 = _mm512_shuffle_f32x4(tmp14265, tmp14273, 221);
wt673 = _mm512_shuffle_f32x4(tmp14260, tmp14268, 136);
wt681 = _mm512_shuffle_f32x4(tmp14260, tmp14268, 221);
wt674 = _mm512_shuffle_f32x4(tmp14262, tmp14270, 136);
wt682 = _mm512_shuffle_f32x4(tmp14262, tmp14270, 221);
wt675 = _mm512_shuffle_f32x4(tmp14264, tmp14272, 136);
wt683 = _mm512_shuffle_f32x4(tmp14264, tmp14272, 221);
wt676 = _mm512_shuffle_f32x4(tmp14266, tmp14274, 136);
wt684 = _mm512_shuffle_f32x4(tmp14266, tmp14274, 221);
wt669 = _mm512_mul_ps(wt669, postMul56);
wt670 = _mm512_mul_ps(wt670, postMul56);
wt671 = _mm512_mul_ps(wt671, postMul56);
wt672 = _mm512_mul_ps(wt672, postMul56);
wt673 = _mm512_mul_ps(wt673, postMul56);
wt674 = _mm512_mul_ps(wt674, postMul56);
wt675 = _mm512_mul_ps(wt675, postMul56);
wt676 = _mm512_mul_ps(wt676, postMul56);
wt677 = _mm512_mul_ps(wt677, postMul56);
wt678 = _mm512_mul_ps(wt678, postMul56);
wt679 = _mm512_mul_ps(wt679, postMul56);
wt680 = _mm512_mul_ps(wt680, postMul56);
wt681 = _mm512_mul_ps(wt681, postMul56);
wt682 = _mm512_mul_ps(wt682, postMul56);
wt683 = _mm512_mul_ps(wt683, postMul56);
wt684 = _mm512_mul_ps(wt684, postMul56);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(1+16*c55)+(ptrdiff_t)0, 63>>cut24, wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(2+16*c55)+(ptrdiff_t)0, 63>>cut24, wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(3+16*c55)+(ptrdiff_t)0, 63>>cut24, wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(4+16*c55)+(ptrdiff_t)0, 63>>cut24, wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(5+16*c55)+(ptrdiff_t)0, 63>>cut24, wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(6+16*c55)+(ptrdiff_t)0, 63>>cut24, wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(7+16*c55)+(ptrdiff_t)0, 63>>cut24, wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(8+16*c55)+(ptrdiff_t)0, 63>>cut24, wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(9+16*c55)+(ptrdiff_t)0, 63>>cut24, wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(10+16*c55)+(ptrdiff_t)0, 63>>cut24, wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(11+16*c55)+(ptrdiff_t)0, 63>>cut24, wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(12+16*c55)+(ptrdiff_t)0, 63>>cut24, wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(13+16*c55)+(ptrdiff_t)0, 63>>cut24, wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(14+16*c55)+(ptrdiff_t)0, 63>>cut24, wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(15+16*c55)+(ptrdiff_t)0, 63>>cut24, wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(16+16*c55)+(ptrdiff_t)0, 63>>cut24, wt684);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(1+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(2+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(3+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(4+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(5+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(6+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(7+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(8+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(9+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(10+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(11+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(12+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(13+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(14+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(15+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(16+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt684);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(1+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(2+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(3+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(4+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(5+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(6+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(7+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(8+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(9+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(10+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(11+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(12+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(13+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(14+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(15+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(16+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt684);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(1+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(2+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(3+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(4+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(5+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(6+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(7+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(8+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(9+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(10+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(11+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(12+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(13+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(14+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(15+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(16+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt684);
}
}
}
}
}

// Submits the weight-arrangement job to the threader.
//
// Fills a task descriptor with ResNeXt50OneArrangeWts11Callee1 as the
// callee, forwards tensors111 untouched through the task's any1 field,
// declares a 3-dimensional hull of extents {32, 1, 1}, and passes the
// descriptor to ResNeXt50ThreaderDo1 on team69.
//
// NOTE(review): presumably the threader calls the callee once per point
// of the hull -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeWts11(ResNeXt50ThreaderTeam1* team69, char** tensors111) {
	static const int extents[3] = {32, 1, 1};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	for (int d = 0; d < 3; ++d) {
		job.hull1[d] = extents[d];
	}
	job.any1 = tensors111;
	job.callee1 = ResNeXt50OneArrangeWts11Callee1;
	ResNeXt50ThreaderDo1(team69, &job);
}

// Copies one tile of the source tensor into the "arranged" buffer.
//
// task116->any1 is read as an array of byte pointers: element 0 is the
// source, element 1 is the destination. pt63[0] selects a run of 128
// consecutive k indices (128*pt63[0] .. 128*pt63[0]+127) and pt63[1]
// selects a slab.
//
// Source addressing:       256 bytes per slab, 832 bytes per k.
// Destination addressing:  262144 bytes per slab, then 256 bytes per k
//                          (64 bytes per k for slab 3).
//
// Any slab other than 3 moves 64 floats (four full 512-bit vectors) per k.
// Slab 3 moves only the low 4 floats of a single vector per k (mask 15).
//
// NOTE(review): k presumably indexes input channels and the slab a
// 64-float span of the flattened spatial plane -- confirm against the
// generator.
static void ResNeXt50OneArrangeDats11Callee1(ResNeXt50ThreaderTask1* task116, int64_t* pt63) {
	char** bufs = task116->any1;
	const ptrdiff_t tile = pt63[0];
	const ptrdiff_t slab = pt63[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t kBegin = 128*tile;
	const ptrdiff_t kEnd = kBegin+128;
	if (slab != 3) {
		// Full-width slab: four whole vectors per k.
		for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
			char* from = src+256*slab+832*k;
			char* to = dst+262144*slab+256*k;
			__m512 v[4];
			// Load everything first, then store, as the original does.
			for (ptrdiff_t n = 0; n < 4; ++n) {
				v[n] = _mm512_maskz_loadu_ps(65535, from+64*n);
			}
			for (ptrdiff_t n = 0; n < 4; ++n) {
				_mm512_mask_storeu_ps(to+64*n, 65535, v[n]);
			}
		}
		return;
	}
	// Tail slab: only lanes 0..3 are read and written for each k.
	for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
		__m512 v = _mm512_maskz_loadu_ps(15, src+256*slab+832*k);
		_mm512_mask_storeu_ps(dst+262144*slab+64*k, 15, v);
	}
}

// Submits the data-arrangement job to the threader.
//
// Fills a task descriptor with ResNeXt50OneArrangeDats11Callee1 as the
// callee, forwards tensors113 untouched through the task's any1 field,
// declares a 4-dimensional hull of extents {8, 4, 1, 1}, and passes the
// descriptor to ResNeXt50ThreaderDo1 on team70.
//
// The callee reads only the first two coordinates: coordinate 0 picks a
// run of 128 k indices and coordinate 1 picks a slab.
static void ResNeXt50OneArrangeDats11(ResNeXt50ThreaderTeam1* team70, char** tensors113) {
	static const int extents[4] = {8, 4, 1, 1};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 4;
	for (int d = 0; d < 4; ++d) {
		job.hull1[d] = extents[d];
	}
	job.any1 = tensors113;
	job.callee1 = ResNeXt50OneArrangeDats11Callee1;
	ResNeXt50ThreaderDo1(team70, &job);
}

// Worker for the "OneApply11" pass: computes one output tile as a
// multiply-accumulate of arranged weights against arranged data over 1024
// steps, clamps the sums at zero, and stores them.
//
// task118->any1 is read as an array of pointers whose element 0 is the tensor
// pointer table: [0] arranged weights, [1] arranged data, [2] output.
// pt64[0] selects the weight group (k), pt64[1] selects the data column
// block (j).
//
// Each call handles exactly one (j, k) pair: every loop over j or k starts
// with its variable equal to its own bound (jj59/kk60/kk61), so the
// "if (x >= bound) return" at the end of the body fires on the first pass.
// The four code paths are:
//   j != 3, k != 85 : 6 rows x 4 vectors, full-width stores
//   j != 3, k == 85 : 2 rows x 4 vectors, full-width stores
//   j == 3, k != 85 : 6 rows x 1 vector, low 4 lanes stored (mask 15)
//   j == 3, k == 85 : 2 rows x 1 vector, low 4 lanes stored (mask 15)
// NOTE(review): presumably invoked once per point of the 86x4 hull set up in
// ResNeXt50OneApply11 -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneApply11Callee1(ResNeXt50ThreaderTask1* task118, int64_t* pt64) {
void** pair28 = task118->any1;
char** tensors116 = pair28[0];
// e33 and g36 are fixed at zero, so the base-pointer offsets below vanish.
ptrdiff_t e33 = 0;
ptrdiff_t g36 = 0;
ptrdiff_t d23 = pt64[1];
ptrdiff_t w69 = pt64[0];
char*restrict arrangedWts11 = tensors116[0]+1712128*e33+(ptrdiff_t)2099200*1*g36;
char*restrict arrangedDats11 = tensors116[1]+694720*e33+(ptrdiff_t)851968*1*g36;
char*restrict datPtr37 = tensors116[2]+(ptrdiff_t)425984*1*g36;
// Single pass: ii51 is 1, so i69 is always 0.
ptrdiff_t ii51 = 1;
for (ptrdiff_t i69 = 0; i69 < ii51; ++i69) {
ptrdiff_t j61 = 1*d23;
ptrdiff_t jj59 = j61+0;
// Column blocks other than 3: four 16-float data vectors per step.
for (; j61 != 3; ++j61) {
ptrdiff_t k170 = 1*w69;
ptrdiff_t kk60 = k170+0;
// Weight groups other than 85: six accumulator rows, 24 bytes of weights per step.
for (; k170 != 85; ++k170) {
// With s68 == -1 the offset 24*s68+24 is 0, so the six floats at the head of
// the 24600-byte weight group seed the accumulators.
// NOTE(review): presumably these are per-output biases -- confirm against the
// weight-arrangement code for this convolution.
ptrdiff_t s68 = -1;
__m512 sum552 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)24));
__m512 sum556 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)28));
__m512 sum560 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)32));
__m512 sum564 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)36));
__m512 sum568 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)40));
__m512 sum572 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)44));
// Replicate each row's seed across that row's other three column accumulators.
__m512 sum553 = sum552;
__m512 sum554 = sum552;
__m512 sum555 = sum552;
__m512 sum557 = sum556;
__m512 sum558 = sum556;
__m512 sum559 = sum556;
__m512 sum561 = sum560;
__m512 sum562 = sum560;
__m512 sum563 = sum560;
__m512 sum565 = sum564;
__m512 sum566 = sum564;
__m512 sum567 = sum564;
__m512 sum569 = sum568;
__m512 sum570 = sum568;
__m512 sum571 = sum568;
__m512 sum573 = sum572;
__m512 sum574 = sum572;
__m512 sum575 = sum572;
// 1024 accumulation steps: load four data vectors once, then for each of the
// six rows broadcast one weight and fuse-multiply-add it into that row's sums.
for (s68 = 0; s68 < 1024; ++s68) {
__m512 dat2472 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)0);
__m512 dat2473 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)64);
__m512 dat2474 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)128);
__m512 dat2475 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)192);
__m512 wt717 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)24));
sum552 = _mm512_fmadd_ps(wt717, dat2472, sum552);
sum553 = _mm512_fmadd_ps(wt717, dat2473, sum553);
sum554 = _mm512_fmadd_ps(wt717, dat2474, sum554);
sum555 = _mm512_fmadd_ps(wt717, dat2475, sum555);
__m512 wt718 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)28));
sum556 = _mm512_fmadd_ps(wt718, dat2472, sum556);
sum557 = _mm512_fmadd_ps(wt718, dat2473, sum557);
sum558 = _mm512_fmadd_ps(wt718, dat2474, sum558);
sum559 = _mm512_fmadd_ps(wt718, dat2475, sum559);
__m512 wt719 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)32));
sum560 = _mm512_fmadd_ps(wt719, dat2472, sum560);
sum561 = _mm512_fmadd_ps(wt719, dat2473, sum561);
sum562 = _mm512_fmadd_ps(wt719, dat2474, sum562);
sum563 = _mm512_fmadd_ps(wt719, dat2475, sum563);
__m512 wt720 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)36));
sum564 = _mm512_fmadd_ps(wt720, dat2472, sum564);
sum565 = _mm512_fmadd_ps(wt720, dat2473, sum565);
sum566 = _mm512_fmadd_ps(wt720, dat2474, sum566);
sum567 = _mm512_fmadd_ps(wt720, dat2475, sum567);
__m512 wt721 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)40));
sum568 = _mm512_fmadd_ps(wt721, dat2472, sum568);
sum569 = _mm512_fmadd_ps(wt721, dat2473, sum569);
sum570 = _mm512_fmadd_ps(wt721, dat2474, sum570);
sum571 = _mm512_fmadd_ps(wt721, dat2475, sum571);
__m512 wt722 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)44));
sum572 = _mm512_fmadd_ps(wt722, dat2472, sum572);
sum573 = _mm512_fmadd_ps(wt722, dat2473, sum573);
sum574 = _mm512_fmadd_ps(wt722, dat2474, sum574);
sum575 = _mm512_fmadd_ps(wt722, dat2475, sum575);
}
// Clamp each sum at zero (max with 0) and store it. In the output, the six
// rows of a group are 832 bytes apart and a group spans 4992 bytes (6*832).
sum552 = _mm512_max_ps(_mm512_setzero_ps(), sum552);
sum553 = _mm512_max_ps(_mm512_setzero_ps(), sum553);
sum554 = _mm512_max_ps(_mm512_setzero_ps(), sum554);
sum555 = _mm512_max_ps(_mm512_setzero_ps(), sum555);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)0, 65535, sum552);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)64, 65535, sum553);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)128, 65535, sum554);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)192, 65535, sum555);
sum556 = _mm512_max_ps(_mm512_setzero_ps(), sum556);
sum557 = _mm512_max_ps(_mm512_setzero_ps(), sum557);
sum558 = _mm512_max_ps(_mm512_setzero_ps(), sum558);
sum559 = _mm512_max_ps(_mm512_setzero_ps(), sum559);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)832, 65535, sum556);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)896, 65535, sum557);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)960, 65535, sum558);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1024, 65535, sum559);
sum560 = _mm512_max_ps(_mm512_setzero_ps(), sum560);
sum561 = _mm512_max_ps(_mm512_setzero_ps(), sum561);
sum562 = _mm512_max_ps(_mm512_setzero_ps(), sum562);
sum563 = _mm512_max_ps(_mm512_setzero_ps(), sum563);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1664, 65535, sum560);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1728, 65535, sum561);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1792, 65535, sum562);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1856, 65535, sum563);
sum564 = _mm512_max_ps(_mm512_setzero_ps(), sum564);
sum565 = _mm512_max_ps(_mm512_setzero_ps(), sum565);
sum566 = _mm512_max_ps(_mm512_setzero_ps(), sum566);
sum567 = _mm512_max_ps(_mm512_setzero_ps(), sum567);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2496, 65535, sum564);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2560, 65535, sum565);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2624, 65535, sum566);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2688, 65535, sum567);
sum568 = _mm512_max_ps(_mm512_setzero_ps(), sum568);
sum569 = _mm512_max_ps(_mm512_setzero_ps(), sum569);
sum570 = _mm512_max_ps(_mm512_setzero_ps(), sum570);
sum571 = _mm512_max_ps(_mm512_setzero_ps(), sum571);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3328, 65535, sum568);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3392, 65535, sum569);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3456, 65535, sum570);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3520, 65535, sum571);
sum572 = _mm512_max_ps(_mm512_setzero_ps(), sum572);
sum573 = _mm512_max_ps(_mm512_setzero_ps(), sum573);
sum574 = _mm512_max_ps(_mm512_setzero_ps(), sum574);
sum575 = _mm512_max_ps(_mm512_setzero_ps(), sum575);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4160, 65535, sum572);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4224, 65535, sum573);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4288, 65535, sum574);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4352, 65535, sum575);
// Always true on the first pass (k170 still equals kk60): one group per call.
if (k170 >= kk60) return;
}
// Reached only when k170 == 85. This last weight group has two rows, so the
// weights are packed 8 bytes per step and seeded from offsets 0 and 4
// (8*s69+8 and 8*s69+12 with s69 == -1).
ptrdiff_t s69 = -1;
__m512 sum576 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)8));
__m512 sum580 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)12));
__m512 sum577 = sum576;
__m512 sum578 = sum576;
__m512 sum579 = sum576;
__m512 sum581 = sum580;
__m512 sum582 = sum580;
__m512 sum583 = sum580;
for (s69 = 0; s69 < 1024; ++s69) {
__m512 dat2476 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)0);
__m512 dat2477 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)64);
__m512 dat2478 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)128);
__m512 dat2479 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)192);
__m512 wt723 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)8));
sum576 = _mm512_fmadd_ps(wt723, dat2476, sum576);
sum577 = _mm512_fmadd_ps(wt723, dat2477, sum577);
sum578 = _mm512_fmadd_ps(wt723, dat2478, sum578);
sum579 = _mm512_fmadd_ps(wt723, dat2479, sum579);
__m512 wt724 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)12));
sum580 = _mm512_fmadd_ps(wt724, dat2476, sum580);
sum581 = _mm512_fmadd_ps(wt724, dat2477, sum581);
sum582 = _mm512_fmadd_ps(wt724, dat2478, sum582);
sum583 = _mm512_fmadd_ps(wt724, dat2479, sum583);
}
sum576 = _mm512_max_ps(_mm512_setzero_ps(), sum576);
sum577 = _mm512_max_ps(_mm512_setzero_ps(), sum577);
sum578 = _mm512_max_ps(_mm512_setzero_ps(), sum578);
sum579 = _mm512_max_ps(_mm512_setzero_ps(), sum579);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)0, 65535, sum576);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)64, 65535, sum577);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)128, 65535, sum578);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)192, 65535, sum579);
sum580 = _mm512_max_ps(_mm512_setzero_ps(), sum580);
sum581 = _mm512_max_ps(_mm512_setzero_ps(), sum581);
sum582 = _mm512_max_ps(_mm512_setzero_ps(), sum582);
sum583 = _mm512_max_ps(_mm512_setzero_ps(), sum583);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)832, 65535, sum580);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)896, 65535, sum581);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)960, 65535, sum582);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1024, 65535, sum583);
// Always true on the first pass (j61 still equals jj59): one column block per call.
if (j61 >= jj59) return;
}
// Reached only when j61 == 3. This column block holds one data vector per
// step (64 bytes apart in the arranged data), and only the low 4 lanes of
// each result are stored (mask 15).
ptrdiff_t k171 = 1*w69;
ptrdiff_t kk61 = k171+0;
for (; k171 != 85; ++k171) {
// Six rows, each with a single accumulator seeded from the head of the group.
ptrdiff_t s70 = -1;
__m512 sum584 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)24));
__m512 sum585 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)28));
__m512 sum586 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)32));
__m512 sum587 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)36));
__m512 sum588 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)40));
__m512 sum589 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)44));
for (s70 = 0; s70 < 1024; ++s70) {
__m512 dat2480 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+64*s70+(ptrdiff_t)0);
__m512 wt725 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)24));
sum584 = _mm512_fmadd_ps(wt725, dat2480, sum584);
__m512 wt726 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)28));
sum585 = _mm512_fmadd_ps(wt726, dat2480, sum585);
__m512 wt727 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)32));
sum586 = _mm512_fmadd_ps(wt727, dat2480, sum586);
__m512 wt728 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)36));
sum587 = _mm512_fmadd_ps(wt728, dat2480, sum587);
__m512 wt729 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)40));
sum588 = _mm512_fmadd_ps(wt729, dat2480, sum588);
__m512 wt730 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)44));
sum589 = _mm512_fmadd_ps(wt730, dat2480, sum589);
}
sum584 = _mm512_max_ps(_mm512_setzero_ps(), sum584);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)0, 15, sum584);
sum585 = _mm512_max_ps(_mm512_setzero_ps(), sum585);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)832, 15, sum585);
sum586 = _mm512_max_ps(_mm512_setzero_ps(), sum586);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)1664, 15, sum586);
sum587 = _mm512_max_ps(_mm512_setzero_ps(), sum587);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)2496, 15, sum587);
sum588 = _mm512_max_ps(_mm512_setzero_ps(), sum588);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)3328, 15, sum588);
sum589 = _mm512_max_ps(_mm512_setzero_ps(), sum589);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)4160, 15, sum589);
// Always true on the first pass (k171 still equals kk61).
if (k171 >= kk61) return;
}
// Reached only when j61 == 3 and k171 == 85: two rows, one vector per step.
ptrdiff_t s71 = -1;
__m512 sum590 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)8));
__m512 sum591 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)12));
for (s71 = 0; s71 < 1024; ++s71) {
__m512 dat2481 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+64*s71+(ptrdiff_t)0);
__m512 wt731 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)8));
sum590 = _mm512_fmadd_ps(wt731, dat2481, sum590);
__m512 wt732 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)12));
sum591 = _mm512_fmadd_ps(wt732, dat2481, sum591);
}
sum590 = _mm512_max_ps(_mm512_setzero_ps(), sum590);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)0, 15, sum590);
sum591 = _mm512_max_ps(_mm512_setzero_ps(), sum591);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)832, 15, sum591);
}
}

// Launches the "OneApply11" pass on the given thread team.
//
// The task payload is a two-slot pointer array: slot 0 is the caller's tensor
// pointer table, slot 1 is null. The callee is ResNeXt50OneApply11Callee1 and
// the 3-dimensional hull is 86 x 4 x 1.
// NOTE(review): the payload array lives in this stack frame, so this relies
// on ResNeXt50ThreaderDo1 not touching it after returning -- presumably it
// blocks until all work is done; confirm against the threader.
static void ResNeXt50OneApply11(ResNeXt50ThreaderTeam1* team71, char** tensors115) {
    static const int extents[] = {86, 4, 1};
    const ptrdiff_t dims = (ptrdiff_t)(sizeof extents/sizeof extents[0]);
    void* payload[2];
    payload[0] = tensors115;
    payload[1] = 0;
    ResNeXt50ThreaderTask1 work;
    work.nd1 = dims;
    for (ptrdiff_t d = 0; d < dims; ++d) {
        work.hull1[d] = extents[d];
    }
    work.any1 = payload;
    work.callee1 = ResNeXt50OneApply11Callee1;
    ResNeXt50ThreaderDo1(team71, &work);
}

static void ResNeXt50OneArrangeWts12Callee1(ResNeXt50ThreaderTask1* task128, int64_t* pt69) {
char** tensors126 = task128->any1;
ptrdiff_t b86 = pt69[0];
char*restrict wtPtr20 = tensors126[0]+(ptrdiff_t)3340*0+(ptrdiff_t)8388608*0;
char*restrict biasPtr20 = tensors126[1]+(ptrdiff_t)8192*0;
char*restrict bnPtr21 = tensors126[2]+(ptrdiff_t)8*2048*0;
char*restrict arranged23 = tensors126[3]+(ptrdiff_t)6848512*0+(ptrdiff_t)8396800*0;
ptrdiff_t ii55 = 1;
for (ptrdiff_t i75 = 0; i75 < ii55; ++i75) {
ptrdiff_t j66 = 1*b86;
ptrdiff_t jj61 = j66+1;
for (; j66 < jj61; ++j66) {
if (j66 < 127) {
ptrdiff_t k179 = 0+16*(j66-0);
ptrdiff_t l77 = (size_t)(0+k179)/6;
ptrdiff_t cut28 = (size_t)(0+k179)%6;
switch (cut28) {
case 0:;
case 2: {
__m512 sum629 = _mm512_maskz_loadu_ps(65535, biasPtr20+8192*i75+4*k179);
__m512i pmMul42 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd42 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo35 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k179+2048*i75));
__m512 masHi35 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k179+2048*i75)+(ptrdiff_t)64);
__m512 postMul65 = _mm512_permutex2var_ps(masLo35, pmMul42, masHi35);
__m512 postAdd43 = _mm512_permutex2var_ps(masLo35, pmAdd42, masHi35);
sum629 = _mm512_fmadd_ps(sum629, postMul65, postAdd43);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum629);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum629);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)49152, 65535-(4095>>cut28), sum629);
ptrdiff_t c61 = 0;
for (; c61 != 64; ++c61) {
__m512 wt753 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)0);
__m512 wt754 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)4096);
__m512 wt755 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)8192);
__m512 wt756 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)12288);
__m512 wt757 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)16384);
__m512 wt758 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)20480);
__m512 wt759 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)24576);
__m512 wt760 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)28672);
__m512 wt761 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)32768);
__m512 wt762 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)36864);
__m512 wt763 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)40960);
__m512 wt764 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)45056);
__m512 wt765 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)49152);
__m512 wt766 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)53248);
__m512 wt767 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)57344);
__m512 wt768 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)61440);
__m512 tmp15491 = _mm512_unpacklo_ps(wt753, wt754);
__m512 tmp15492 = _mm512_unpackhi_ps(wt753, wt754);
__m512 tmp15493 = _mm512_unpacklo_ps(wt755, wt756);
__m512 tmp15494 = _mm512_unpackhi_ps(wt755, wt756);
__m512 tmp15495 = _mm512_unpacklo_ps(wt757, wt758);
__m512 tmp15496 = _mm512_unpackhi_ps(wt757, wt758);
__m512 tmp15497 = _mm512_unpacklo_ps(wt759, wt760);
__m512 tmp15498 = _mm512_unpackhi_ps(wt759, wt760);
__m512 tmp15499 = _mm512_unpacklo_ps(wt761, wt762);
__m512 tmp15500 = _mm512_unpackhi_ps(wt761, wt762);
__m512 tmp15501 = _mm512_unpacklo_ps(wt763, wt764);
__m512 tmp15502 = _mm512_unpackhi_ps(wt763, wt764);
__m512 tmp15503 = _mm512_unpacklo_ps(wt765, wt766);
__m512 tmp15504 = _mm512_unpackhi_ps(wt765, wt766);
__m512 tmp15505 = _mm512_unpacklo_ps(wt767, wt768);
__m512 tmp15506 = _mm512_unpackhi_ps(wt767, wt768);
__m512 tmp15507 = _mm512_shuffle_ps(tmp15491, tmp15493, 68);
__m512 tmp15508 = _mm512_shuffle_ps(tmp15491, tmp15493, 238);
__m512 tmp15509 = _mm512_shuffle_ps(tmp15492, tmp15494, 68);
__m512 tmp15510 = _mm512_shuffle_ps(tmp15492, tmp15494, 238);
__m512 tmp15511 = _mm512_shuffle_ps(tmp15495, tmp15497, 68);
__m512 tmp15512 = _mm512_shuffle_ps(tmp15495, tmp15497, 238);
__m512 tmp15513 = _mm512_shuffle_ps(tmp15496, tmp15498, 68);
__m512 tmp15514 = _mm512_shuffle_ps(tmp15496, tmp15498, 238);
__m512 tmp15515 = _mm512_shuffle_ps(tmp15499, tmp15501, 68);
__m512 tmp15516 = _mm512_shuffle_ps(tmp15499, tmp15501, 238);
__m512 tmp15517 = _mm512_shuffle_ps(tmp15500, tmp15502, 68);
__m512 tmp15518 = _mm512_shuffle_ps(tmp15500, tmp15502, 238);
__m512 tmp15519 = _mm512_shuffle_ps(tmp15503, tmp15505, 68);
__m512 tmp15520 = _mm512_shuffle_ps(tmp15503, tmp15505, 238);
__m512 tmp15521 = _mm512_shuffle_ps(tmp15504, tmp15506, 68);
__m512 tmp15522 = _mm512_shuffle_ps(tmp15504, tmp15506, 238);
__m512 tmp15523 = _mm512_shuffle_f32x4(tmp15507, tmp15511, 136);
__m512 tmp15524 = _mm512_shuffle_f32x4(tmp15507, tmp15511, 221);
__m512 tmp15525 = _mm512_shuffle_f32x4(tmp15508, tmp15512, 136);
__m512 tmp15526 = _mm512_shuffle_f32x4(tmp15508, tmp15512, 221);
__m512 tmp15527 = _mm512_shuffle_f32x4(tmp15509, tmp15513, 136);
__m512 tmp15528 = _mm512_shuffle_f32x4(tmp15509, tmp15513, 221);
__m512 tmp15529 = _mm512_shuffle_f32x4(tmp15510, tmp15514, 136);
__m512 tmp15530 = _mm512_shuffle_f32x4(tmp15510, tmp15514, 221);
__m512 tmp15531 = _mm512_shuffle_f32x4(tmp15515, tmp15519, 136);
__m512 tmp15532 = _mm512_shuffle_f32x4(tmp15515, tmp15519, 221);
__m512 tmp15533 = _mm512_shuffle_f32x4(tmp15516, tmp15520, 136);
__m512 tmp15534 = _mm512_shuffle_f32x4(tmp15516, tmp15520, 221);
__m512 tmp15535 = _mm512_shuffle_f32x4(tmp15517, tmp15521, 136);
__m512 tmp15536 = _mm512_shuffle_f32x4(tmp15517, tmp15521, 221);
__m512 tmp15537 = _mm512_shuffle_f32x4(tmp15518, tmp15522, 136);
__m512 tmp15538 = _mm512_shuffle_f32x4(tmp15518, tmp15522, 221);
wt753 = _mm512_shuffle_f32x4(tmp15523, tmp15531, 136);
wt761 = _mm512_shuffle_f32x4(tmp15523, tmp15531, 221);
wt754 = _mm512_shuffle_f32x4(tmp15525, tmp15533, 136);
wt762 = _mm512_shuffle_f32x4(tmp15525, tmp15533, 221);
wt755 = _mm512_shuffle_f32x4(tmp15527, tmp15535, 136);
wt763 = _mm512_shuffle_f32x4(tmp15527, tmp15535, 221);
wt756 = _mm512_shuffle_f32x4(tmp15529, tmp15537, 136);
wt764 = _mm512_shuffle_f32x4(tmp15529, tmp15537, 221);
wt757 = _mm512_shuffle_f32x4(tmp15524, tmp15532, 136);
wt765 = _mm512_shuffle_f32x4(tmp15524, tmp15532, 221);
wt758 = _mm512_shuffle_f32x4(tmp15526, tmp15534, 136);
wt766 = _mm512_shuffle_f32x4(tmp15526, tmp15534, 221);
wt759 = _mm512_shuffle_f32x4(tmp15528, tmp15536, 136);
wt767 = _mm512_shuffle_f32x4(tmp15528, tmp15536, 221);
wt760 = _mm512_shuffle_f32x4(tmp15530, tmp15538, 136);
wt768 = _mm512_shuffle_f32x4(tmp15530, tmp15538, 221);
wt753 = _mm512_mul_ps(wt753, postMul65);
wt754 = _mm512_mul_ps(wt754, postMul65);
wt755 = _mm512_mul_ps(wt755, postMul65);
wt756 = _mm512_mul_ps(wt756, postMul65);
wt757 = _mm512_mul_ps(wt757, postMul65);
wt758 = _mm512_mul_ps(wt758, postMul65);
wt759 = _mm512_mul_ps(wt759, postMul65);
wt760 = _mm512_mul_ps(wt760, postMul65);
wt761 = _mm512_mul_ps(wt761, postMul65);
wt762 = _mm512_mul_ps(wt762, postMul65);
wt763 = _mm512_mul_ps(wt763, postMul65);
wt764 = _mm512_mul_ps(wt764, postMul65);
wt765 = _mm512_mul_ps(wt765, postMul65);
wt766 = _mm512_mul_ps(wt766, postMul65);
wt767 = _mm512_mul_ps(wt767, postMul65);
wt768 = _mm512_mul_ps(wt768, postMul65);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c61)+(ptrdiff_t)0, 63>>cut28, wt753);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c61)+(ptrdiff_t)0, 63>>cut28, wt754);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c61)+(ptrdiff_t)0, 63>>cut28, wt755);
_mm512_mask_storeu_ps(arranged23+83968