NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=DenseNet121 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=image Channels=3 Height=224 Width=224
Conv FromTensor=image ToTensor=sevenDS ToChannels=64 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=3 PaddingW=3 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=sevenDS ToTensor=bn1 Epsilon=0.00001
Activation FromTensor=bn1 ToTensor=relu1 Kind=ReLU Param=0
Pooling FromTensor=relu1 ToTensor=pool1 Kind=Max3x3Stride2 PaddingH=1 PaddingW=1
BatchNorm FromTensor=pool1 ToTensor=bn2 Epsilon=0.00001
Activation FromTensor=bn2 ToTensor=relu2 Kind=ReLU Param=0
Conv FromTensor=relu2 ToTensor=one1 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one1 ToTensor=bn3 Epsilon=0.00001
Activation FromTensor=bn3 ToTensor=relu3 Kind=ReLU Param=0
Conv FromTensor=relu3 ToTensor=three1 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=pool1 FromTensor2=three1 ToTensor=concat1
BatchNorm FromTensor=concat1 ToTensor=bn4 Epsilon=0.00001
Activation FromTensor=bn4 ToTensor=relu4 Kind=ReLU Param=0
Conv FromTensor=relu4 ToTensor=one2 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one2 ToTensor=bn5 Epsilon=0.00001
Activation FromTensor=bn5 ToTensor=relu5 Kind=ReLU Param=0
Conv FromTensor=relu5 ToTensor=three2 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat1 FromTensor2=three2 ToTensor=concat2
BatchNorm FromTensor=concat2 ToTensor=bn6 Epsilon=0.00001
Activation FromTensor=bn6 ToTensor=relu6 Kind=ReLU Param=0
Conv FromTensor=relu6 ToTensor=one3 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one3 ToTensor=bn7 Epsilon=0.00001
Activation FromTensor=bn7 ToTensor=relu7 Kind=ReLU Param=0
Conv FromTensor=relu7 ToTensor=three3 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat2 FromTensor2=three3 ToTensor=concat3
BatchNorm FromTensor=concat3 ToTensor=bn8 Epsilon=0.00001
Activation FromTensor=bn8 ToTensor=relu8 Kind=ReLU Param=0
Conv FromTensor=relu8 ToTensor=one4 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one4 ToTensor=bn9 Epsilon=0.00001
Activation FromTensor=bn9 ToTensor=relu9 Kind=ReLU Param=0
Conv FromTensor=relu9 ToTensor=three4 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat3 FromTensor2=three4 ToTensor=concat4
BatchNorm FromTensor=concat4 ToTensor=bn10 Epsilon=0.00001
Activation FromTensor=bn10 ToTensor=relu10 Kind=ReLU Param=0
Conv FromTensor=relu10 ToTensor=one5 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one5 ToTensor=bn11 Epsilon=0.00001
Activation FromTensor=bn11 ToTensor=relu11 Kind=ReLU Param=0
Conv FromTensor=relu11 ToTensor=three5 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat4 FromTensor2=three5 ToTensor=concat5
BatchNorm FromTensor=concat5 ToTensor=bn12 Epsilon=0.00001
Activation FromTensor=bn12 ToTensor=relu12 Kind=ReLU Param=0
Conv FromTensor=relu12 ToTensor=one6 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one6 ToTensor=bn13 Epsilon=0.00001
Activation FromTensor=bn13 ToTensor=relu13 Kind=ReLU Param=0
Conv FromTensor=relu13 ToTensor=three6 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat5 FromTensor2=three6 ToTensor=concat6
BatchNorm FromTensor=concat6 ToTensor=bn14 Epsilon=0.00001
Activation FromTensor=bn14 ToTensor=relu14 Kind=ReLU Param=0
Conv FromTensor=relu14 ToTensor=one7 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
Pooling FromTensor=one7 ToTensor=pool2 Kind=Avg2x2Stride2 PaddingH=0 PaddingW=0
BatchNorm FromTensor=pool2 ToTensor=bn15 Epsilon=0.00001
Activation FromTensor=bn15 ToTensor=relu15 Kind=ReLU Param=0
Conv FromTensor=relu15 ToTensor=one8 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one8 ToTensor=bn16 Epsilon=0.00001
Activation FromTensor=bn16 ToTensor=relu16 Kind=ReLU Param=0
Conv FromTensor=relu16 ToTensor=three7 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=pool2 FromTensor2=three7 ToTensor=concat7
BatchNorm FromTensor=concat7 ToTensor=bn17 Epsilon=0.00001
Activation FromTensor=bn17 ToTensor=relu17 Kind=ReLU Param=0
Conv FromTensor=relu17 ToTensor=one9 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one9 ToTensor=bn18 Epsilon=0.00001
Activation FromTensor=bn18 ToTensor=relu18 Kind=ReLU Param=0
Conv FromTensor=relu18 ToTensor=three8 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat7 FromTensor2=three8 ToTensor=concat8
BatchNorm FromTensor=concat8 ToTensor=bn19 Epsilon=0.00001
Activation FromTensor=bn19 ToTensor=relu19 Kind=ReLU Param=0
Conv FromTensor=relu19 ToTensor=one10 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one10 ToTensor=bn20 Epsilon=0.00001
Activation FromTensor=bn20 ToTensor=relu20 Kind=ReLU Param=0
Conv FromTensor=relu20 ToTensor=three9 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat8 FromTensor2=three9 ToTensor=concat9
BatchNorm FromTensor=concat9 ToTensor=bn21 Epsilon=0.00001
Activation FromTensor=bn21 ToTensor=relu21 Kind=ReLU Param=0
Conv FromTensor=relu21 ToTensor=one11 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one11 ToTensor=bn22 Epsilon=0.00001
Activation FromTensor=bn22 ToTensor=relu22 Kind=ReLU Param=0
Conv FromTensor=relu22 ToTensor=three10 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat9 FromTensor2=three10 ToTensor=concat10
BatchNorm FromTensor=concat10 ToTensor=bn23 Epsilon=0.00001
Activation FromTensor=bn23 ToTensor=relu23 Kind=ReLU Param=0
Conv FromTensor=relu23 ToTensor=one12 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one12 ToTensor=bn24 Epsilon=0.00001
Activation FromTensor=bn24 ToTensor=relu24 Kind=ReLU Param=0
Conv FromTensor=relu24 ToTensor=three11 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat10 FromTensor2=three11 ToTensor=concat11
BatchNorm FromTensor=concat11 ToTensor=bn25 Epsilon=0.00001
Activation FromTensor=bn25 ToTensor=relu25 Kind=ReLU Param=0
Conv FromTensor=relu25 ToTensor=one13 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one13 ToTensor=bn26 Epsilon=0.00001
Activation FromTensor=bn26 ToTensor=relu26 Kind=ReLU Param=0
Conv FromTensor=relu26 ToTensor=three12 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat11 FromTensor2=three12 ToTensor=concat12
BatchNorm FromTensor=concat12 ToTensor=bn27 Epsilon=0.00001
Activation FromTensor=bn27 ToTensor=relu27 Kind=ReLU Param=0
Conv FromTensor=relu27 ToTensor=one14 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one14 ToTensor=bn28 Epsilon=0.00001
Activation FromTensor=bn28 ToTensor=relu28 Kind=ReLU Param=0
Conv FromTensor=relu28 ToTensor=three13 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat12 FromTensor2=three13 ToTensor=concat13
BatchNorm FromTensor=concat13 ToTensor=bn29 Epsilon=0.00001
Activation FromTensor=bn29 ToTensor=relu29 Kind=ReLU Param=0
Conv FromTensor=relu29 ToTensor=one15 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one15 ToTensor=bn30 Epsilon=0.00001
Activation FromTensor=bn30 ToTensor=relu30 Kind=ReLU Param=0
Conv FromTensor=relu30 ToTensor=three14 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat13 FromTensor2=three14 ToTensor=concat14
BatchNorm FromTensor=concat14 ToTensor=bn31 Epsilon=0.00001
Activation FromTensor=bn31 ToTensor=relu31 Kind=ReLU Param=0
Conv FromTensor=relu31 ToTensor=one16 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one16 ToTensor=bn32 Epsilon=0.00001
Activation FromTensor=bn32 ToTensor=relu32 Kind=ReLU Param=0
Conv FromTensor=relu32 ToTensor=three15 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat14 FromTensor2=three15 ToTensor=concat15
BatchNorm FromTensor=concat15 ToTensor=bn33 Epsilon=0.00001
Activation FromTensor=bn33 ToTensor=relu33 Kind=ReLU Param=0
Conv FromTensor=relu33 ToTensor=one17 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one17 ToTensor=bn34 Epsilon=0.00001
Activation FromTensor=bn34 ToTensor=relu34 Kind=ReLU Param=0
Conv FromTensor=relu34 ToTensor=three16 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat15 FromTensor2=three16 ToTensor=concat16
BatchNorm FromTensor=concat16 ToTensor=bn35 Epsilon=0.00001
Activation FromTensor=bn35 ToTensor=relu35 Kind=ReLU Param=0
Conv FromTensor=relu35 ToTensor=one18 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one18 ToTensor=bn36 Epsilon=0.00001
Activation FromTensor=bn36 ToTensor=relu36 Kind=ReLU Param=0
Conv FromTensor=relu36 ToTensor=three17 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat16 FromTensor2=three17 ToTensor=concat17
BatchNorm FromTensor=concat17 ToTensor=bn37 Epsilon=0.00001
Activation FromTensor=bn37 ToTensor=relu37 Kind=ReLU Param=0
Conv FromTensor=relu37 ToTensor=one19 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one19 ToTensor=bn38 Epsilon=0.00001
Activation FromTensor=bn38 ToTensor=relu38 Kind=ReLU Param=0
Conv FromTensor=relu38 ToTensor=three18 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat17 FromTensor2=three18 ToTensor=concat18
BatchNorm FromTensor=concat18 ToTensor=bn39 Epsilon=0.00001
Activation FromTensor=bn39 ToTensor=relu39 Kind=ReLU Param=0
Conv FromTensor=relu39 ToTensor=one20 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
Pooling FromTensor=one20 ToTensor=pool3 Kind=Avg2x2Stride2 PaddingH=0 PaddingW=0
BatchNorm FromTensor=pool3 ToTensor=bn40 Epsilon=0.00001
Activation FromTensor=bn40 ToTensor=relu40 Kind=ReLU Param=0
Conv FromTensor=relu40 ToTensor=one21 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one21 ToTensor=bn41 Epsilon=0.00001
Activation FromTensor=bn41 ToTensor=relu41 Kind=ReLU Param=0
Conv FromTensor=relu41 ToTensor=three19 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=pool3 FromTensor2=three19 ToTensor=concat19
BatchNorm FromTensor=concat19 ToTensor=bn42 Epsilon=0.00001
Activation FromTensor=bn42 ToTensor=relu42 Kind=ReLU Param=0
Conv FromTensor=relu42 ToTensor=one22 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one22 ToTensor=bn43 Epsilon=0.00001
Activation FromTensor=bn43 ToTensor=relu43 Kind=ReLU Param=0
Conv FromTensor=relu43 ToTensor=three20 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat19 FromTensor2=three20 ToTensor=concat20
BatchNorm FromTensor=concat20 ToTensor=bn44 Epsilon=0.00001
Activation FromTensor=bn44 ToTensor=relu44 Kind=ReLU Param=0
Conv FromTensor=relu44 ToTensor=one23 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one23 ToTensor=bn45 Epsilon=0.00001
Activation FromTensor=bn45 ToTensor=relu45 Kind=ReLU Param=0
Conv FromTensor=relu45 ToTensor=three21 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat20 FromTensor2=three21 ToTensor=concat21
BatchNorm FromTensor=concat21 ToTensor=bn46 Epsilon=0.00001
Activation FromTensor=bn46 ToTensor=relu46 Kind=ReLU Param=0
Conv FromTensor=relu46 ToTensor=one24 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one24 ToTensor=bn47 Epsilon=0.00001
Activation FromTensor=bn47 ToTensor=relu47 Kind=ReLU Param=0
Conv FromTensor=relu47 ToTensor=three22 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat21 FromTensor2=three22 ToTensor=concat22
BatchNorm FromTensor=concat22 ToTensor=bn48 Epsilon=0.00001
Activation FromTensor=bn48 ToTensor=relu48 Kind=ReLU Param=0
Conv FromTensor=relu48 ToTensor=one25 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one25 ToTensor=bn49 Epsilon=0.00001
Activation FromTensor=bn49 ToTensor=relu49 Kind=ReLU Param=0
Conv FromTensor=relu49 ToTensor=three23 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat22 FromTensor2=three23 ToTensor=concat23
BatchNorm FromTensor=concat23 ToTensor=bn50 Epsilon=0.00001
Activation FromTensor=bn50 ToTensor=relu50 Kind=ReLU Param=0
Conv FromTensor=relu50 ToTensor=one26 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one26 ToTensor=bn51 Epsilon=0.00001
Activation FromTensor=bn51 ToTensor=relu51 Kind=ReLU Param=0
Conv FromTensor=relu51 ToTensor=three24 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat23 FromTensor2=three24 ToTensor=concat24
BatchNorm FromTensor=concat24 ToTensor=bn52 Epsilon=0.00001
Activation FromTensor=bn52 ToTensor=relu52 Kind=ReLU Param=0
Conv FromTensor=relu52 ToTensor=one27 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one27 ToTensor=bn53 Epsilon=0.00001
Activation FromTensor=bn53 ToTensor=relu53 Kind=ReLU Param=0
Conv FromTensor=relu53 ToTensor=three25 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat24 FromTensor2=three25 ToTensor=concat25
BatchNorm FromTensor=concat25 ToTensor=bn54 Epsilon=0.00001
Activation FromTensor=bn54 ToTensor=relu54 Kind=ReLU Param=0
Conv FromTensor=relu54 ToTensor=one28 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one28 ToTensor=bn55 Epsilon=0.00001
Activation FromTensor=bn55 ToTensor=relu55 Kind=ReLU Param=0
Conv FromTensor=relu55 ToTensor=three26 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat25 FromTensor2=three26 ToTensor=concat26
BatchNorm FromTensor=concat26 ToTensor=bn56 Epsilon=0.00001
Activation FromTensor=bn56 ToTensor=relu56 Kind=ReLU Param=0
Conv FromTensor=relu56 ToTensor=one29 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one29 ToTensor=bn57 Epsilon=0.00001
Activation FromTensor=bn57 ToTensor=relu57 Kind=ReLU Param=0
Conv FromTensor=relu57 ToTensor=three27 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat26 FromTensor2=three27 ToTensor=concat27
BatchNorm FromTensor=concat27 ToTensor=bn58 Epsilon=0.00001
Activation FromTensor=bn58 ToTensor=relu58 Kind=ReLU Param=0
Conv FromTensor=relu58 ToTensor=one30 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one30 ToTensor=bn59 Epsilon=0.00001
Activation FromTensor=bn59 ToTensor=relu59 Kind=ReLU Param=0
Conv FromTensor=relu59 ToTensor=three28 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat27 FromTensor2=three28 ToTensor=concat28
BatchNorm FromTensor=concat28 ToTensor=bn60 Epsilon=0.00001
Activation FromTensor=bn60 ToTensor=relu60 Kind=ReLU Param=0
Conv FromTensor=relu60 ToTensor=one31 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one31 ToTensor=bn61 Epsilon=0.00001
Activation FromTensor=bn61 ToTensor=relu61 Kind=ReLU Param=0
Conv FromTensor=relu61 ToTensor=three29 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat28 FromTensor2=three29 ToTensor=concat29
BatchNorm FromTensor=concat29 ToTensor=bn62 Epsilon=0.00001
Activation FromTensor=bn62 ToTensor=relu62 Kind=ReLU Param=0
Conv FromTensor=relu62 ToTensor=one32 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one32 ToTensor=bn63 Epsilon=0.00001
Activation FromTensor=bn63 ToTensor=relu63 Kind=ReLU Param=0
Conv FromTensor=relu63 ToTensor=three30 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat29 FromTensor2=three30 ToTensor=concat30
BatchNorm FromTensor=concat30 ToTensor=bn64 Epsilon=0.00001
Activation FromTensor=bn64 ToTensor=relu64 Kind=ReLU Param=0
Conv FromTensor=relu64 ToTensor=one33 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one33 ToTensor=bn65 Epsilon=0.00001
Activation FromTensor=bn65 ToTensor=relu65 Kind=ReLU Param=0
Conv FromTensor=relu65 ToTensor=three31 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat30 FromTensor2=three31 ToTensor=concat31
BatchNorm FromTensor=concat31 ToTensor=bn66 Epsilon=0.00001
Activation FromTensor=bn66 ToTensor=relu66 Kind=ReLU Param=0
Conv FromTensor=relu66 ToTensor=one34 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one34 ToTensor=bn67 Epsilon=0.00001
Activation FromTensor=bn67 ToTensor=relu67 Kind=ReLU Param=0
Conv FromTensor=relu67 ToTensor=three32 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat31 FromTensor2=three32 ToTensor=concat32
BatchNorm FromTensor=concat32 ToTensor=bn68 Epsilon=0.00001
Activation FromTensor=bn68 ToTensor=relu68 Kind=ReLU Param=0
Conv FromTensor=relu68 ToTensor=one35 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one35 ToTensor=bn69 Epsilon=0.00001
Activation FromTensor=bn69 ToTensor=relu69 Kind=ReLU Param=0
Conv FromTensor=relu69 ToTensor=three33 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat32 FromTensor2=three33 ToTensor=concat33
BatchNorm FromTensor=concat33 ToTensor=bn70 Epsilon=0.00001
Activation FromTensor=bn70 ToTensor=relu70 Kind=ReLU Param=0
Conv FromTensor=relu70 ToTensor=one36 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one36 ToTensor=bn71 Epsilon=0.00001
Activation FromTensor=bn71 ToTensor=relu71 Kind=ReLU Param=0
Conv FromTensor=relu71 ToTensor=three34 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat33 FromTensor2=three34 ToTensor=concat34
BatchNorm FromTensor=concat34 ToTensor=bn72 Epsilon=0.00001
Activation FromTensor=bn72 ToTensor=relu72 Kind=ReLU Param=0
Conv FromTensor=relu72 ToTensor=one37 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one37 ToTensor=bn73 Epsilon=0.00001
Activation FromTensor=bn73 ToTensor=relu73 Kind=ReLU Param=0
Conv FromTensor=relu73 ToTensor=three35 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat34 FromTensor2=three35 ToTensor=concat35
BatchNorm FromTensor=concat35 ToTensor=bn74 Epsilon=0.00001
Activation FromTensor=bn74 ToTensor=relu74 Kind=ReLU Param=0
Conv FromTensor=relu74 ToTensor=one38 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one38 ToTensor=bn75 Epsilon=0.00001
Activation FromTensor=bn75 ToTensor=relu75 Kind=ReLU Param=0
Conv FromTensor=relu75 ToTensor=three36 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat35 FromTensor2=three36 ToTensor=concat36
BatchNorm FromTensor=concat36 ToTensor=bn76 Epsilon=0.00001
Activation FromTensor=bn76 ToTensor=relu76 Kind=ReLU Param=0
Conv FromTensor=relu76 ToTensor=one39 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one39 ToTensor=bn77 Epsilon=0.00001
Activation FromTensor=bn77 ToTensor=relu77 Kind=ReLU Param=0
Conv FromTensor=relu77 ToTensor=three37 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat36 FromTensor2=three37 ToTensor=concat37
BatchNorm FromTensor=concat37 ToTensor=bn78 Epsilon=0.00001
Activation FromTensor=bn78 ToTensor=relu78 Kind=ReLU Param=0
Conv FromTensor=relu78 ToTensor=one40 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one40 ToTensor=bn79 Epsilon=0.00001
Activation FromTensor=bn79 ToTensor=relu79 Kind=ReLU Param=0
Conv FromTensor=relu79 ToTensor=three38 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat37 FromTensor2=three38 ToTensor=concat38
BatchNorm FromTensor=concat38 ToTensor=bn80 Epsilon=0.00001
Activation FromTensor=bn80 ToTensor=relu80 Kind=ReLU Param=0
Conv FromTensor=relu80 ToTensor=one41 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one41 ToTensor=bn81 Epsilon=0.00001
Activation FromTensor=bn81 ToTensor=relu81 Kind=ReLU Param=0
Conv FromTensor=relu81 ToTensor=three39 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat38 FromTensor2=three39 ToTensor=concat39
BatchNorm FromTensor=concat39 ToTensor=bn82 Epsilon=0.00001
Activation FromTensor=bn82 ToTensor=relu82 Kind=ReLU Param=0
Conv FromTensor=relu82 ToTensor=one42 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one42 ToTensor=bn83 Epsilon=0.00001
Activation FromTensor=bn83 ToTensor=relu83 Kind=ReLU Param=0
Conv FromTensor=relu83 ToTensor=three40 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat39 FromTensor2=three40 ToTensor=concat40
BatchNorm FromTensor=concat40 ToTensor=bn84 Epsilon=0.00001
Activation FromTensor=bn84 ToTensor=relu84 Kind=ReLU Param=0
Conv FromTensor=relu84 ToTensor=one43 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one43 ToTensor=bn85 Epsilon=0.00001
Activation FromTensor=bn85 ToTensor=relu85 Kind=ReLU Param=0
Conv FromTensor=relu85 ToTensor=three41 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat40 FromTensor2=three41 ToTensor=concat41
BatchNorm FromTensor=concat41 ToTensor=bn86 Epsilon=0.00001
Activation FromTensor=bn86 ToTensor=relu86 Kind=ReLU Param=0
Conv FromTensor=relu86 ToTensor=one44 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one44 ToTensor=bn87 Epsilon=0.00001
Activation FromTensor=bn87 ToTensor=relu87 Kind=ReLU Param=0
Conv FromTensor=relu87 ToTensor=three42 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat41 FromTensor2=three42 ToTensor=concat42
BatchNorm FromTensor=concat42 ToTensor=bn88 Epsilon=0.00001
Activation FromTensor=bn88 ToTensor=relu88 Kind=ReLU Param=0
Conv FromTensor=relu88 ToTensor=one45 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
Pooling FromTensor=one45 ToTensor=pool4 Kind=Avg2x2Stride2 PaddingH=0 PaddingW=0
BatchNorm FromTensor=pool4 ToTensor=bn89 Epsilon=0.00001
Activation FromTensor=bn89 ToTensor=relu89 Kind=ReLU Param=0
Conv FromTensor=relu89 ToTensor=one46 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one46 ToTensor=bn90 Epsilon=0.00001
Activation FromTensor=bn90 ToTensor=relu90 Kind=ReLU Param=0
Conv FromTensor=relu90 ToTensor=three43 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=pool4 FromTensor2=three43 ToTensor=concat43
BatchNorm FromTensor=concat43 ToTensor=bn91 Epsilon=0.00001
Activation FromTensor=bn91 ToTensor=relu91 Kind=ReLU Param=0
Conv FromTensor=relu91 ToTensor=one47 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one47 ToTensor=bn92 Epsilon=0.00001
Activation FromTensor=bn92 ToTensor=relu92 Kind=ReLU Param=0
Conv FromTensor=relu92 ToTensor=three44 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat43 FromTensor2=three44 ToTensor=concat44
BatchNorm FromTensor=concat44 ToTensor=bn93 Epsilon=0.00001
Activation FromTensor=bn93 ToTensor=relu93 Kind=ReLU Param=0
Conv FromTensor=relu93 ToTensor=one48 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one48 ToTensor=bn94 Epsilon=0.00001
Activation FromTensor=bn94 ToTensor=relu94 Kind=ReLU Param=0
Conv FromTensor=relu94 ToTensor=three45 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat44 FromTensor2=three45 ToTensor=concat45
BatchNorm FromTensor=concat45 ToTensor=bn95 Epsilon=0.00001
Activation FromTensor=bn95 ToTensor=relu95 Kind=ReLU Param=0
Conv FromTensor=relu95 ToTensor=one49 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one49 ToTensor=bn96 Epsilon=0.00001
Activation FromTensor=bn96 ToTensor=relu96 Kind=ReLU Param=0
Conv FromTensor=relu96 ToTensor=three46 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat45 FromTensor2=three46 ToTensor=concat46
BatchNorm FromTensor=concat46 ToTensor=bn97 Epsilon=0.00001
Activation FromTensor=bn97 ToTensor=relu97 Kind=ReLU Param=0
Conv FromTensor=relu97 ToTensor=one50 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one50 ToTensor=bn98 Epsilon=0.00001
Activation FromTensor=bn98 ToTensor=relu98 Kind=ReLU Param=0
Conv FromTensor=relu98 ToTensor=three47 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat46 FromTensor2=three47 ToTensor=concat47
BatchNorm FromTensor=concat47 ToTensor=bn99 Epsilon=0.00001
Activation FromTensor=bn99 ToTensor=relu99 Kind=ReLU Param=0
Conv FromTensor=relu99 ToTensor=one51 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one51 ToTensor=bn100 Epsilon=0.00001
Activation FromTensor=bn100 ToTensor=relu100 Kind=ReLU Param=0
Conv FromTensor=relu100 ToTensor=three48 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat47 FromTensor2=three48 ToTensor=concat48
BatchNorm FromTensor=concat48 ToTensor=bn101 Epsilon=0.00001
Activation FromTensor=bn101 ToTensor=relu101 Kind=ReLU Param=0
Conv FromTensor=relu101 ToTensor=one52 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one52 ToTensor=bn102 Epsilon=0.00001
Activation FromTensor=bn102 ToTensor=relu102 Kind=ReLU Param=0
Conv FromTensor=relu102 ToTensor=three49 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat48 FromTensor2=three49 ToTensor=concat49
BatchNorm FromTensor=concat49 ToTensor=bn103 Epsilon=0.00001
Activation FromTensor=bn103 ToTensor=relu103 Kind=ReLU Param=0
Conv FromTensor=relu103 ToTensor=one53 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one53 ToTensor=bn104 Epsilon=0.00001
Activation FromTensor=bn104 ToTensor=relu104 Kind=ReLU Param=0
Conv FromTensor=relu104 ToTensor=three50 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat49 FromTensor2=three50 ToTensor=concat50
BatchNorm FromTensor=concat50 ToTensor=bn105 Epsilon=0.00001
Activation FromTensor=bn105 ToTensor=relu105 Kind=ReLU Param=0
Conv FromTensor=relu105 ToTensor=one54 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one54 ToTensor=bn106 Epsilon=0.00001
Activation FromTensor=bn106 ToTensor=relu106 Kind=ReLU Param=0
Conv FromTensor=relu106 ToTensor=three51 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat50 FromTensor2=three51 ToTensor=concat51
BatchNorm FromTensor=concat51 ToTensor=bn107 Epsilon=0.00001
Activation FromTensor=bn107 ToTensor=relu107 Kind=ReLU Param=0
Conv FromTensor=relu107 ToTensor=one55 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one55 ToTensor=bn108 Epsilon=0.00001
Activation FromTensor=bn108 ToTensor=relu108 Kind=ReLU Param=0
Conv FromTensor=relu108 ToTensor=three52 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat51 FromTensor2=three52 ToTensor=concat52
BatchNorm FromTensor=concat52 ToTensor=bn109 Epsilon=0.00001
Activation FromTensor=bn109 ToTensor=relu109 Kind=ReLU Param=0
Conv FromTensor=relu109 ToTensor=one56 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one56 ToTensor=bn110 Epsilon=0.00001
Activation FromTensor=bn110 ToTensor=relu110 Kind=ReLU Param=0
Conv FromTensor=relu110 ToTensor=three53 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat52 FromTensor2=three53 ToTensor=concat53
BatchNorm FromTensor=concat53 ToTensor=bn111 Epsilon=0.00001
Activation FromTensor=bn111 ToTensor=relu111 Kind=ReLU Param=0
Conv FromTensor=relu111 ToTensor=one57 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one57 ToTensor=bn112 Epsilon=0.00001
Activation FromTensor=bn112 ToTensor=relu112 Kind=ReLU Param=0
Conv FromTensor=relu112 ToTensor=three54 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat53 FromTensor2=three54 ToTensor=concat54
BatchNorm FromTensor=concat54 ToTensor=bn113 Epsilon=0.00001
Activation FromTensor=bn113 ToTensor=relu113 Kind=ReLU Param=0
Conv FromTensor=relu113 ToTensor=one58 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one58 ToTensor=bn114 Epsilon=0.00001
Activation FromTensor=bn114 ToTensor=relu114 Kind=ReLU Param=0
Conv FromTensor=relu114 ToTensor=three55 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat54 FromTensor2=three55 ToTensor=concat55
BatchNorm FromTensor=concat55 ToTensor=bn115 Epsilon=0.00001
Activation FromTensor=bn115 ToTensor=relu115 Kind=ReLU Param=0
Conv FromTensor=relu115 ToTensor=one59 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one59 ToTensor=bn116 Epsilon=0.00001
Activation FromTensor=bn116 ToTensor=relu116 Kind=ReLU Param=0
Conv FromTensor=relu116 ToTensor=three56 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat55 FromTensor2=three56 ToTensor=concat56
BatchNorm FromTensor=concat56 ToTensor=bn117 Epsilon=0.00001
Activation FromTensor=bn117 ToTensor=relu117 Kind=ReLU Param=0
Conv FromTensor=relu117 ToTensor=one60 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one60 ToTensor=bn118 Epsilon=0.00001
Activation FromTensor=bn118 ToTensor=relu118 Kind=ReLU Param=0
Conv FromTensor=relu118 ToTensor=three57 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat56 FromTensor2=three57 ToTensor=concat57
BatchNorm FromTensor=concat57 ToTensor=bn119 Epsilon=0.00001
Activation FromTensor=bn119 ToTensor=relu119 Kind=ReLU Param=0
Conv FromTensor=relu119 ToTensor=one61 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one61 ToTensor=bn120 Epsilon=0.00001
Activation FromTensor=bn120 ToTensor=relu120 Kind=ReLU Param=0
Conv FromTensor=relu120 ToTensor=three58 ToChannels=32 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
Concat FromTensor1=concat57 FromTensor2=three58 ToTensor=concat58
BatchNorm FromTensor=concat58 ToTensor=bn121 Epsilon=0.00001
Activation FromTensor=bn121 ToTensor=relu121 Kind=ReLU Param=0
Pooling FromTensor=relu121 ToTensor=pool5 Kind=AvgGlobal PaddingH=0 PaddingW=0
FullyConnected FromTensor=pool5 ToTensor=fc ToChannels=1000
Softmax FromTensor=fc ToTensor=prob
Output FromTensor=prob

Top || Output DenseNet121.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(DenseNet121Params);
// DenseNet121Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb");
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration of the trained-parameter struct; its full
// definition (the field list) appears later in this header.
typedef struct DenseNet121Params DenseNet121Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// DenseNet121Params* params = malloc(sizeof(DenseNet121Params));
//
// ... Load params (read from a file, perhaps) ...
//
// DenseNet121Net* net; // For example, 4 threads:
// char* err = DenseNet121NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// DenseNet121NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Declared here as an incomplete type; the functions that follow
// create and destroy it through pointers.
typedef struct DenseNet121Net DenseNet121Net;

// Builds a Net from the given parameter struct, using `threads`
// temporary threads for the construction work. The parameter struct
// is read but not modified and may be freed once this call returns.
//
// Returns null on success. On failure returns an error string that
// the caller must free, and *net is left unmodified.
char* DenseNet121NetCreate(
    DenseNet121Net** net,
    DenseNet121Params* params,
    ptrdiff_t threads
);

// Destroys a Net to free its memory. Call this only after all
// inference that depends on the Net is complete.
void DenseNet121NetDestroy(DenseNet121Net* net);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// DenseNet121Net* net;
//
// ... Create net ...
//
// DenseNet121Engine* engine; // For example, 4 inference threads:
// char* err = DenseNet121EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
// printf("%s\n", err); // Explain the failure, add a newline.
// free(err); // Free the error string to avoid a memory leak.
//
// ... Destroy net ...
//
// exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// DenseNet121EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = DenseNet121EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* imageData = malloc(sizeof(float)*3*224*224);
// float* probData = malloc(sizeof(float)*1000*1*1);
//
// for (...) { // Reuse the input and output tensors.
//
// ... Write the input floats ...
//
// DenseNet121EngineInference( // This function cannot fail.
// engine, // Pass an Engine as the first argument.
// imageData, // The tensor arguments are sorted by name.
// probData
// );
//
// ... Read the output floats ...
//
// }
//
// free(imageData);
// free(probData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Declared here as an incomplete type; the functions that follow
// take it by pointer.
typedef struct DenseNet121Engine DenseNet121Engine;

// Creates an Engine that performs inference against `net` with
// `threads` inference threads. For best performance the thread count
// should not exceed the number of CPU cores.
//
// Returns null on success. On failure returns an error string that
// the caller must free, and *engine is left unmodified.
char* DenseNet121EngineCreate(
    DenseNet121Engine** engine,
    DenseNet121Net* net,
    ptrdiff_t threads
);

// Writes to *to the pthread_t identifier of the Engine thread at
// index threadIdx. An Engine with N threads has indices 0 through
// N-1. The identifier can then be used with the POSIX threads API,
// for example to set the thread's CPU affinity mask.
//
// Returns nonzero (an error) only when threadIdx is invalid.
char* DenseNet121EnginePthreadT(
    DenseNet121Engine* engine,
    ptrdiff_t threadIdx,
    pthread_t* to
);

// Runs one inference pass: reads the input tensor from imageData
// (3x224x224 floats) and writes the output tensor to probData
// (1000x1x1 floats). Both buffers are allocated and freed by the
// caller and hold fully packed CHW 32-bit floats with no padding.
// Tensor arguments are ordered by name. This function cannot fail.
void DenseNet121EngineInference(
    DenseNet121Engine* engine,
    float* imageData,
    float* probData
);

// Destroys an Engine: terminates its inference threads and frees its
// memory. The Net it points to is destroyed separately.
void DenseNet121EngineDestroy(DenseNet121Engine* engine);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

struct DenseNet121Params {
float bn100Means[128]; // 1x128x1x1
float bn100Scales[128]; // 1x128x1x1
float bn100Shifts[128]; // 1x128x1x1
float bn100Variances[128]; // 1x128x1x1
float bn101Means[704]; // 1x704x1x1
float bn101Scales[704]; // 1x704x1x1
float bn101Shifts[704]; // 1x704x1x1
float bn101Variances[704]; // 1x704x1x1
float bn102Means[128]; // 1x128x1x1
float bn102Scales[128]; // 1x128x1x1
float bn102Shifts[128]; // 1x128x1x1
float bn102Variances[128]; // 1x128x1x1
float bn103Means[736]; // 1x736x1x1
float bn103Scales[736]; // 1x736x1x1
float bn103Shifts[736]; // 1x736x1x1
float bn103Variances[736]; // 1x736x1x1
float bn104Means[128]; // 1x128x1x1
float bn104Scales[128]; // 1x128x1x1
float bn104Shifts[128]; // 1x128x1x1
float bn104Variances[128]; // 1x128x1x1
float bn105Means[768]; // 1x768x1x1
float bn105Scales[768]; // 1x768x1x1
float bn105Shifts[768]; // 1x768x1x1
float bn105Variances[768]; // 1x768x1x1
float bn106Means[128]; // 1x128x1x1
float bn106Scales[128]; // 1x128x1x1
float bn106Shifts[128]; // 1x128x1x1
float bn106Variances[128]; // 1x128x1x1
float bn107Means[800]; // 1x800x1x1
float bn107Scales[800]; // 1x800x1x1
float bn107Shifts[800]; // 1x800x1x1
float bn107Variances[800]; // 1x800x1x1
float bn108Means[128]; // 1x128x1x1
float bn108Scales[128]; // 1x128x1x1
float bn108Shifts[128]; // 1x128x1x1
float bn108Variances[128]; // 1x128x1x1
float bn109Means[832]; // 1x832x1x1
float bn109Scales[832]; // 1x832x1x1
float bn109Shifts[832]; // 1x832x1x1
float bn109Variances[832]; // 1x832x1x1
float bn10Means[192]; // 1x192x1x1
float bn10Scales[192]; // 1x192x1x1
float bn10Shifts[192]; // 1x192x1x1
float bn10Variances[192]; // 1x192x1x1
float bn110Means[128]; // 1x128x1x1
float bn110Scales[128]; // 1x128x1x1
float bn110Shifts[128]; // 1x128x1x1
float bn110Variances[128]; // 1x128x1x1
float bn111Means[864]; // 1x864x1x1
float bn111Scales[864]; // 1x864x1x1
float bn111Shifts[864]; // 1x864x1x1
float bn111Variances[864]; // 1x864x1x1
float bn112Means[128]; // 1x128x1x1
float bn112Scales[128]; // 1x128x1x1
float bn112Shifts[128]; // 1x128x1x1
float bn112Variances[128]; // 1x128x1x1
float bn113Means[896]; // 1x896x1x1
float bn113Scales[896]; // 1x896x1x1
float bn113Shifts[896]; // 1x896x1x1
float bn113Variances[896]; // 1x896x1x1
float bn114Means[128]; // 1x128x1x1
float bn114Scales[128]; // 1x128x1x1
float bn114Shifts[128]; // 1x128x1x1
float bn114Variances[128]; // 1x128x1x1
float bn115Means[928]; // 1x928x1x1
float bn115Scales[928]; // 1x928x1x1
float bn115Shifts[928]; // 1x928x1x1
float bn115Variances[928]; // 1x928x1x1
float bn116Means[128]; // 1x128x1x1
float bn116Scales[128]; // 1x128x1x1
float bn116Shifts[128]; // 1x128x1x1
float bn116Variances[128]; // 1x128x1x1
float bn117Means[960]; // 1x960x1x1
float bn117Scales[960]; // 1x960x1x1
float bn117Shifts[960]; // 1x960x1x1
float bn117Variances[960]; // 1x960x1x1
float bn118Means[128]; // 1x128x1x1
float bn118Scales[128]; // 1x128x1x1
float bn118Shifts[128]; // 1x128x1x1
float bn118Variances[128]; // 1x128x1x1
float bn119Means[992]; // 1x992x1x1
float bn119Scales[992]; // 1x992x1x1
float bn119Shifts[992]; // 1x992x1x1
float bn119Variances[992]; // 1x992x1x1
float bn11Means[128]; // 1x128x1x1
float bn11Scales[128]; // 1x128x1x1
float bn11Shifts[128]; // 1x128x1x1
float bn11Variances[128]; // 1x128x1x1
float bn120Means[128]; // 1x128x1x1
float bn120Scales[128]; // 1x128x1x1
float bn120Shifts[128]; // 1x128x1x1
float bn120Variances[128]; // 1x128x1x1
float bn121Means[1024]; // 1x1024x1x1
float bn121Scales[1024]; // 1x1024x1x1
float bn121Shifts[1024]; // 1x1024x1x1
float bn121Variances[1024]; // 1x1024x1x1
float bn12Means[224]; // 1x224x1x1
float bn12Scales[224]; // 1x224x1x1
float bn12Shifts[224]; // 1x224x1x1
float bn12Variances[224]; // 1x224x1x1
float bn13Means[128]; // 1x128x1x1
float bn13Scales[128]; // 1x128x1x1
float bn13Shifts[128]; // 1x128x1x1
float bn13Variances[128]; // 1x128x1x1
float bn14Means[256]; // 1x256x1x1
float bn14Scales[256]; // 1x256x1x1
float bn14Shifts[256]; // 1x256x1x1
float bn14Variances[256]; // 1x256x1x1
float bn15Means[128]; // 1x128x1x1
float bn15Scales[128]; // 1x128x1x1
float bn15Shifts[128]; // 1x128x1x1
float bn15Variances[128]; // 1x128x1x1
float bn16Means[128]; // 1x128x1x1
float bn16Scales[128]; // 1x128x1x1
float bn16Shifts[128]; // 1x128x1x1
float bn16Variances[128]; // 1x128x1x1
float bn17Means[160]; // 1x160x1x1
float bn17Scales[160]; // 1x160x1x1
float bn17Shifts[160]; // 1x160x1x1
float bn17Variances[160]; // 1x160x1x1
float bn18Means[128]; // 1x128x1x1
float bn18Scales[128]; // 1x128x1x1
float bn18Shifts[128]; // 1x128x1x1
float bn18Variances[128]; // 1x128x1x1
float bn19Means[192]; // 1x192x1x1
float bn19Scales[192]; // 1x192x1x1
float bn19Shifts[192]; // 1x192x1x1
float bn19Variances[192]; // 1x192x1x1
float bn1Means[64]; // 1x64x1x1
float bn1Scales[64]; // 1x64x1x1
float bn1Shifts[64]; // 1x64x1x1
float bn1Variances[64]; // 1x64x1x1
float bn20Means[128]; // 1x128x1x1
float bn20Scales[128]; // 1x128x1x1
float bn20Shifts[128]; // 1x128x1x1
float bn20Variances[128]; // 1x128x1x1
float bn21Means[224]; // 1x224x1x1
float bn21Scales[224]; // 1x224x1x1
float bn21Shifts[224]; // 1x224x1x1
float bn21Variances[224]; // 1x224x1x1
float bn22Means[128]; // 1x128x1x1
float bn22Scales[128]; // 1x128x1x1
float bn22Shifts[128]; // 1x128x1x1
float bn22Variances[128]; // 1x128x1x1
float bn23Means[256]; // 1x256x1x1
float bn23Scales[256]; // 1x256x1x1
float bn23Shifts[256]; // 1x256x1x1
float bn23Variances[256]; // 1x256x1x1
float bn24Means[128]; // 1x128x1x1
float bn24Scales[128]; // 1x128x1x1
float bn24Shifts[128]; // 1x128x1x1
float bn24Variances[128]; // 1x128x1x1
float bn25Means[288]; // 1x288x1x1
float bn25Scales[288]; // 1x288x1x1
float bn25Shifts[288]; // 1x288x1x1
float bn25Variances[288]; // 1x288x1x1
float bn26Means[128]; // 1x128x1x1
float bn26Scales[128]; // 1x128x1x1
float bn26Shifts[128]; // 1x128x1x1
float bn26Variances[128]; // 1x128x1x1
float bn27Means[320]; // 1x320x1x1
float bn27Scales[320]; // 1x320x1x1
float bn27Shifts[320]; // 1x320x1x1
float bn27Variances[320]; // 1x320x1x1
float bn28Means[128]; // 1x128x1x1
float bn28Scales[128]; // 1x128x1x1
float bn28Shifts[128]; // 1x128x1x1
float bn28Variances[128]; // 1x128x1x1
float bn29Means[352]; // 1x352x1x1
float bn29Scales[352]; // 1x352x1x1
float bn29Shifts[352]; // 1x352x1x1
float bn29Variances[352]; // 1x352x1x1
float bn2Means[64]; // 1x64x1x1
float bn2Scales[64]; // 1x64x1x1
float bn2Shifts[64]; // 1x64x1x1
float bn2Variances[64]; // 1x64x1x1
float bn30Means[128]; // 1x128x1x1
float bn30Scales[128]; // 1x128x1x1
float bn30Shifts[128]; // 1x128x1x1
float bn30Variances[128]; // 1x128x1x1
float bn31Means[384]; // 1x384x1x1
float bn31Scales[384]; // 1x384x1x1
float bn31Shifts[384]; // 1x384x1x1
float bn31Variances[384]; // 1x384x1x1
float bn32Means[128]; // 1x128x1x1
float bn32Scales[128]; // 1x128x1x1
float bn32Shifts[128]; // 1x128x1x1
float bn32Variances[128]; // 1x128x1x1
float bn33Means[416]; // 1x416x1x1
float bn33Scales[416]; // 1x416x1x1
float bn33Shifts[416]; // 1x416x1x1
float bn33Variances[416]; // 1x416x1x1
float bn34Means[128]; // 1x128x1x1
float bn34Scales[128]; // 1x128x1x1
float bn34Shifts[128]; // 1x128x1x1
float bn34Variances[128]; // 1x128x1x1
float bn35Means[448]; // 1x448x1x1
float bn35Scales[448]; // 1x448x1x1
float bn35Shifts[448]; // 1x448x1x1
float bn35Variances[448]; // 1x448x1x1
float bn36Means[128]; // 1x128x1x1
float bn36Scales[128]; // 1x128x1x1
float bn36Shifts[128]; // 1x128x1x1
float bn36Variances[128]; // 1x128x1x1
float bn37Means[480]; // 1x480x1x1
float bn37Scales[480]; // 1x480x1x1
float bn37Shifts[480]; // 1x480x1x1
float bn37Variances[480]; // 1x480x1x1
float bn38Means[128]; // 1x128x1x1
float bn38Scales[128]; // 1x128x1x1
float bn38Shifts[128]; // 1x128x1x1
float bn38Variances[128]; // 1x128x1x1
float bn39Means[512]; // 1x512x1x1
float bn39Scales[512]; // 1x512x1x1
float bn39Shifts[512]; // 1x512x1x1
float bn39Variances[512]; // 1x512x1x1
float bn3Means[128]; // 1x128x1x1
float bn3Scales[128]; // 1x128x1x1
float bn3Shifts[128]; // 1x128x1x1
float bn3Variances[128]; // 1x128x1x1
float bn40Means[256]; // 1x256x1x1
float bn40Scales[256]; // 1x256x1x1
float bn40Shifts[256]; // 1x256x1x1
float bn40Variances[256]; // 1x256x1x1
float bn41Means[128]; // 1x128x1x1
float bn41Scales[128]; // 1x128x1x1
float bn41Shifts[128]; // 1x128x1x1
float bn41Variances[128]; // 1x128x1x1
float bn42Means[288]; // 1x288x1x1
float bn42Scales[288]; // 1x288x1x1
float bn42Shifts[288]; // 1x288x1x1
float bn42Variances[288]; // 1x288x1x1
float bn43Means[128]; // 1x128x1x1
float bn43Scales[128]; // 1x128x1x1
float bn43Shifts[128]; // 1x128x1x1
float bn43Variances[128]; // 1x128x1x1
float bn44Means[320]; // 1x320x1x1
float bn44Scales[320]; // 1x320x1x1
float bn44Shifts[320]; // 1x320x1x1
float bn44Variances[320]; // 1x320x1x1
float bn45Means[128]; // 1x128x1x1
float bn45Scales[128]; // 1x128x1x1
float bn45Shifts[128]; // 1x128x1x1
float bn45Variances[128]; // 1x128x1x1
float bn46Means[352]; // 1x352x1x1
float bn46Scales[352]; // 1x352x1x1
float bn46Shifts[352]; // 1x352x1x1
float bn46Variances[352]; // 1x352x1x1
float bn47Means[128]; // 1x128x1x1
float bn47Scales[128]; // 1x128x1x1
float bn47Shifts[128]; // 1x128x1x1
float bn47Variances[128]; // 1x128x1x1
float bn48Means[384]; // 1x384x1x1
float bn48Scales[384]; // 1x384x1x1
float bn48Shifts[384]; // 1x384x1x1
float bn48Variances[384]; // 1x384x1x1
float bn49Means[128]; // 1x128x1x1
float bn49Scales[128]; // 1x128x1x1
float bn49Shifts[128]; // 1x128x1x1
float bn49Variances[128]; // 1x128x1x1
float bn4Means[96]; // 1x96x1x1
float bn4Scales[96]; // 1x96x1x1
float bn4Shifts[96]; // 1x96x1x1
float bn4Variances[96]; // 1x96x1x1
float bn50Means[416]; // 1x416x1x1
float bn50Scales[416]; // 1x416x1x1
float bn50Shifts[416]; // 1x416x1x1
float bn50Variances[416]; // 1x416x1x1
float bn51Means[128]; // 1x128x1x1
float bn51Scales[128]; // 1x128x1x1
float bn51Shifts[128]; // 1x128x1x1
float bn51Variances[128]; // 1x128x1x1
float bn52Means[448]; // 1x448x1x1
float bn52Scales[448]; // 1x448x1x1
float bn52Shifts[448]; // 1x448x1x1
float bn52Variances[448]; // 1x448x1x1
float bn53Means[128]; // 1x128x1x1
float bn53Scales[128]; // 1x128x1x1
float bn53Shifts[128]; // 1x128x1x1
float bn53Variances[128]; // 1x128x1x1
float bn54Means[480]; // 1x480x1x1
float bn54Scales[480]; // 1x480x1x1
float bn54Shifts[480]; // 1x480x1x1
float bn54Variances[480]; // 1x480x1x1
float bn55Means[128]; // 1x128x1x1
float bn55Scales[128]; // 1x128x1x1
float bn55Shifts[128]; // 1x128x1x1
float bn55Variances[128]; // 1x128x1x1
float bn56Means[512]; // 1x512x1x1
float bn56Scales[512]; // 1x512x1x1
float bn56Shifts[512]; // 1x512x1x1
float bn56Variances[512]; // 1x512x1x1
float bn57Means[128]; // 1x128x1x1
float bn57Scales[128]; // 1x128x1x1
float bn57Shifts[128]; // 1x128x1x1
float bn57Variances[128]; // 1x128x1x1
float bn58Means[544]; // 1x544x1x1
float bn58Scales[544]; // 1x544x1x1
float bn58Shifts[544]; // 1x544x1x1
float bn58Variances[544]; // 1x544x1x1
float bn59Means[128]; // 1x128x1x1
float bn59Scales[128]; // 1x128x1x1
float bn59Shifts[128]; // 1x128x1x1
float bn59Variances[128]; // 1x128x1x1
float bn5Means[128]; // 1x128x1x1
float bn5Scales[128]; // 1x128x1x1
float bn5Shifts[128]; // 1x128x1x1
float bn5Variances[128]; // 1x128x1x1
float bn60Means[576]; // 1x576x1x1
float bn60Scales[576]; // 1x576x1x1
float bn60Shifts[576]; // 1x576x1x1
float bn60Variances[576]; // 1x576x1x1
float bn61Means[128]; // 1x128x1x1
float bn61Scales[128]; // 1x128x1x1
float bn61Shifts[128]; // 1x128x1x1
float bn61Variances[128]; // 1x128x1x1
float bn62Means[608]; // 1x608x1x1
float bn62Scales[608]; // 1x608x1x1
float bn62Shifts[608]; // 1x608x1x1
float bn62Variances[608]; // 1x608x1x1
float bn63Means[128]; // 1x128x1x1
float bn63Scales[128]; // 1x128x1x1
float bn63Shifts[128]; // 1x128x1x1
float bn63Variances[128]; // 1x128x1x1
float bn64Means[640]; // 1x640x1x1
float bn64Scales[640]; // 1x640x1x1
float bn64Shifts[640]; // 1x640x1x1
float bn64Variances[640]; // 1x640x1x1
float bn65Means[128]; // 1x128x1x1
float bn65Scales[128]; // 1x128x1x1
float bn65Shifts[128]; // 1x128x1x1
float bn65Variances[128]; // 1x128x1x1
float bn66Means[672]; // 1x672x1x1
float bn66Scales[672]; // 1x672x1x1
float bn66Shifts[672]; // 1x672x1x1
float bn66Variances[672]; // 1x672x1x1
float bn67Means[128]; // 1x128x1x1
float bn67Scales[128]; // 1x128x1x1
float bn67Shifts[128]; // 1x128x1x1
float bn67Variances[128]; // 1x128x1x1
float bn68Means[704]; // 1x704x1x1
float bn68Scales[704]; // 1x704x1x1
float bn68Shifts[704]; // 1x704x1x1
float bn68Variances[704]; // 1x704x1x1
float bn69Means[128]; // 1x128x1x1
float bn69Scales[128]; // 1x128x1x1
float bn69Shifts[128]; // 1x128x1x1
float bn69Variances[128]; // 1x128x1x1
float bn6Means[128]; // 1x128x1x1
float bn6Scales[128]; // 1x128x1x1
float bn6Shifts[128]; // 1x128x1x1
float bn6Variances[128]; // 1x128x1x1
float bn70Means[736]; // 1x736x1x1
float bn70Scales[736]; // 1x736x1x1
float bn70Shifts[736]; // 1x736x1x1
float bn70Variances[736]; // 1x736x1x1
float bn71Means[128]; // 1x128x1x1
float bn71Scales[128]; // 1x128x1x1
float bn71Shifts[128]; // 1x128x1x1
float bn71Variances[128]; // 1x128x1x1
float bn72Means[768]; // 1x768x1x1
float bn72Scales[768]; // 1x768x1x1
float bn72Shifts[768]; // 1x768x1x1
float bn72Variances[768]; // 1x768x1x1
float bn73Means[128]; // 1x128x1x1
float bn73Scales[128]; // 1x128x1x1
float bn73Shifts[128]; // 1x128x1x1
float bn73Variances[128]; // 1x128x1x1
float bn74Means[800]; // 1x800x1x1
float bn74Scales[800]; // 1x800x1x1
float bn74Shifts[800]; // 1x800x1x1
float bn74Variances[800]; // 1x800x1x1
float bn75Means[128]; // 1x128x1x1
float bn75Scales[128]; // 1x128x1x1
float bn75Shifts[128]; // 1x128x1x1
float bn75Variances[128]; // 1x128x1x1
float bn76Means[832]; // 1x832x1x1
float bn76Scales[832]; // 1x832x1x1
float bn76Shifts[832]; // 1x832x1x1
float bn76Variances[832]; // 1x832x1x1
float bn77Means[128]; // 1x128x1x1
float bn77Scales[128]; // 1x128x1x1
float bn77Shifts[128]; // 1x128x1x1
float bn77Variances[128]; // 1x128x1x1
float bn78Means[864]; // 1x864x1x1
float bn78Scales[864]; // 1x864x1x1
float bn78Shifts[864]; // 1x864x1x1
float bn78Variances[864]; // 1x864x1x1
float bn79Means[128]; // 1x128x1x1
float bn79Scales[128]; // 1x128x1x1
float bn79Shifts[128]; // 1x128x1x1
float bn79Variances[128]; // 1x128x1x1
float bn7Means[128]; // 1x128x1x1
float bn7Scales[128]; // 1x128x1x1
float bn7Shifts[128]; // 1x128x1x1
float bn7Variances[128]; // 1x128x1x1
float bn80Means[896]; // 1x896x1x1
float bn80Scales[896]; // 1x896x1x1
float bn80Shifts[896]; // 1x896x1x1
float bn80Variances[896]; // 1x896x1x1
float bn81Means[128]; // 1x128x1x1
float bn81Scales[128]; // 1x128x1x1
float bn81Shifts[128]; // 1x128x1x1
float bn81Variances[128]; // 1x128x1x1
float bn82Means[928]; // 1x928x1x1
float bn82Scales[928]; // 1x928x1x1
float bn82Shifts[928]; // 1x928x1x1
float bn82Variances[928]; // 1x928x1x1
float bn83Means[128]; // 1x128x1x1
float bn83Scales[128]; // 1x128x1x1
float bn83Shifts[128]; // 1x128x1x1
float bn83Variances[128]; // 1x128x1x1
float bn84Means[960]; // 1x960x1x1
float bn84Scales[960]; // 1x960x1x1
float bn84Shifts[960]; // 1x960x1x1
float bn84Variances[960]; // 1x960x1x1
float bn85Means[128]; // 1x128x1x1
float bn85Scales[128]; // 1x128x1x1
float bn85Shifts[128]; // 1x128x1x1
float bn85Variances[128]; // 1x128x1x1
float bn86Means[992]; // 1x992x1x1
float bn86Scales[992]; // 1x992x1x1
float bn86Shifts[992]; // 1x992x1x1
float bn86Variances[992]; // 1x992x1x1
float bn87Means[128]; // 1x128x1x1
float bn87Scales[128]; // 1x128x1x1
float bn87Shifts[128]; // 1x128x1x1
float bn87Variances[128]; // 1x128x1x1
float bn88Means[1024]; // 1x1024x1x1
float bn88Scales[1024]; // 1x1024x1x1
float bn88Shifts[1024]; // 1x1024x1x1
float bn88Variances[1024]; // 1x1024x1x1
float bn89Means[512]; // 1x512x1x1
float bn89Scales[512]; // 1x512x1x1
float bn89Shifts[512]; // 1x512x1x1
float bn89Variances[512]; // 1x512x1x1
float bn8Means[160]; // 1x160x1x1
float bn8Scales[160]; // 1x160x1x1
float bn8Shifts[160]; // 1x160x1x1
float bn8Variances[160]; // 1x160x1x1
float bn90Means[128]; // 1x128x1x1
float bn90Scales[128]; // 1x128x1x1
float bn90Shifts[128]; // 1x128x1x1
float bn90Variances[128]; // 1x128x1x1
float bn91Means[544]; // 1x544x1x1
float bn91Scales[544]; // 1x544x1x1
float bn91Shifts[544]; // 1x544x1x1
float bn91Variances[544]; // 1x544x1x1
float bn92Means[128]; // 1x128x1x1
float bn92Scales[128]; // 1x128x1x1
float bn92Shifts[128]; // 1x128x1x1
float bn92Variances[128]; // 1x128x1x1
float bn93Means[576]; // 1x576x1x1
float bn93Scales[576]; // 1x576x1x1
float bn93Shifts[576]; // 1x576x1x1
float bn93Variances[576]; // 1x576x1x1
float bn94Means[128]; // 1x128x1x1
float bn94Scales[128]; // 1x128x1x1
float bn94Shifts[128]; // 1x128x1x1
float bn94Variances[128]; // 1x128x1x1
float bn95Means[608]; // 1x608x1x1
float bn95Scales[608]; // 1x608x1x1
float bn95Shifts[608]; // 1x608x1x1
float bn95Variances[608]; // 1x608x1x1
float bn96Means[128]; // 1x128x1x1
float bn96Scales[128]; // 1x128x1x1
float bn96Shifts[128]; // 1x128x1x1
float bn96Variances[128]; // 1x128x1x1
float bn97Means[640]; // 1x640x1x1
float bn97Scales[640]; // 1x640x1x1
float bn97Shifts[640]; // 1x640x1x1
float bn97Variances[640]; // 1x640x1x1
float bn98Means[128]; // 1x128x1x1
float bn98Scales[128]; // 1x128x1x1
float bn98Shifts[128]; // 1x128x1x1
float bn98Variances[128]; // 1x128x1x1
float bn99Means[672]; // 1x672x1x1
float bn99Scales[672]; // 1x672x1x1
float bn99Shifts[672]; // 1x672x1x1
float bn99Variances[672]; // 1x672x1x1
float bn9Means[128]; // 1x128x1x1
float bn9Scales[128]; // 1x128x1x1
float bn9Shifts[128]; // 1x128x1x1
float bn9Variances[128]; // 1x128x1x1
float fcBiases[1000]; // 1x1000x1x1
float fcWeights[1024000]; // 1000x1024x1x1
float one10Biases[128]; // 1x128x1x1
float one10Weights[24576]; // 128x192x1x1
float one11Biases[128]; // 1x128x1x1
float one11Weights[28672]; // 128x224x1x1
float one12Biases[128]; // 1x128x1x1
float one12Weights[32768]; // 128x256x1x1
float one13Biases[128]; // 1x128x1x1
float one13Weights[36864]; // 128x288x1x1
float one14Biases[128]; // 1x128x1x1
float one14Weights[40960]; // 128x320x1x1
float one15Biases[128]; // 1x128x1x1
float one15Weights[45056]; // 128x352x1x1
float one16Biases[128]; // 1x128x1x1
float one16Weights[49152]; // 128x384x1x1
float one17Biases[128]; // 1x128x1x1
float one17Weights[53248]; // 128x416x1x1
float one18Biases[128]; // 1x128x1x1
float one18Weights[57344]; // 128x448x1x1
float one19Biases[128]; // 1x128x1x1
float one19Weights[61440]; // 128x480x1x1
float one1Biases[128]; // 1x128x1x1
float one1Weights[8192]; // 128x64x1x1
float one20Biases[256]; // 1x256x1x1
float one20Weights[131072]; // 256x512x1x1
float one21Biases[128]; // 1x128x1x1
float one21Weights[32768]; // 128x256x1x1
float one22Biases[128]; // 1x128x1x1
float one22Weights[36864]; // 128x288x1x1
float one23Biases[128]; // 1x128x1x1
float one23Weights[40960]; // 128x320x1x1
float one24Biases[128]; // 1x128x1x1
float one24Weights[45056]; // 128x352x1x1
float one25Biases[128]; // 1x128x1x1
float one25Weights[49152]; // 128x384x1x1
float one26Biases[128]; // 1x128x1x1
float one26Weights[53248]; // 128x416x1x1
float one27Biases[128]; // 1x128x1x1
float one27Weights[57344]; // 128x448x1x1
float one28Biases[128]; // 1x128x1x1
float one28Weights[61440]; // 128x480x1x1
float one29Biases[128]; // 1x128x1x1
float one29Weights[65536]; // 128x512x1x1
float one2Biases[128]; // 1x128x1x1
float one2Weights[12288]; // 128x96x1x1
float one30Biases[128]; // 1x128x1x1
float one30Weights[69632]; // 128x544x1x1
float one31Biases[128]; // 1x128x1x1
float one31Weights[73728]; // 128x576x1x1
float one32Biases[128]; // 1x128x1x1
float one32Weights[77824]; // 128x608x1x1
float one33Biases[128]; // 1x128x1x1
float one33Weights[81920]; // 128x640x1x1
float one34Biases[128]; // 1x128x1x1
float one34Weights[86016]; // 128x672x1x1
float one35Biases[128]; // 1x128x1x1
float one35Weights[90112]; // 128x704x1x1
float one36Biases[128]; // 1x128x1x1
float one36Weights[94208]; // 128x736x1x1
float one37Biases[128]; // 1x128x1x1
float one37Weights[98304]; // 128x768x1x1
float one38Biases[128]; // 1x128x1x1
float one38Weights[102400]; // 128x800x1x1
float one39Biases[128]; // 1x128x1x1
float one39Weights[106496]; // 128x832x1x1
float one3Biases[128]; // 1x128x1x1
float one3Weights[16384]; // 128x128x1x1
float one40Biases[128]; // 1x128x1x1
float one40Weights[110592]; // 128x864x1x1
float one41Biases[128]; // 1x128x1x1
float one41Weights[114688]; // 128x896x1x1
float one42Biases[128]; // 1x128x1x1
float one42Weights[118784]; // 128x928x1x1
float one43Biases[128]; // 1x128x1x1
float one43Weights[122880]; // 128x960x1x1
float one44Biases[128]; // 1x128x1x1
float one44Weights[126976]; // 128x992x1x1
float one45Biases[512]; // 1x512x1x1
float one45Weights[524288]; // 512x1024x1x1
float one46Biases[128]; // 1x128x1x1
float one46Weights[65536]; // 128x512x1x1
float one47Biases[128]; // 1x128x1x1
float one47Weights[69632]; // 128x544x1x1
float one48Biases[128]; // 1x128x1x1
float one48Weights[73728]; // 128x576x1x1
float one49Biases[128]; // 1x128x1x1
float one49Weights[77824]; // 128x608x1x1
float one4Biases[128]; // 1x128x1x1
float one4Weights[20480]; // 128x160x1x1
float one50Biases[128]; // 1x128x1x1
float one50Weights[81920]; // 128x640x1x1
float one51Biases[128]; // 1x128x1x1
float one51Weights[86016]; // 128x672x1x1
float one52Biases[128]; // 1x128x1x1
float one52Weights[90112]; // 128x704x1x1
float one53Biases[128]; // 1x128x1x1
float one53Weights[94208]; // 128x736x1x1
float one54Biases[128]; // 1x128x1x1
float one54Weights[98304]; // 128x768x1x1
float one55Biases[128]; // 1x128x1x1
float one55Weights[102400]; // 128x800x1x1
float one56Biases[128]; // 1x128x1x1
float one56Weights[106496]; // 128x832x1x1
float one57Biases[128]; // 1x128x1x1
float one57Weights[110592]; // 128x864x1x1
float one58Biases[128]; // 1x128x1x1
float one58Weights[114688]; // 128x896x1x1
float one59Biases[128]; // 1x128x1x1
float one59Weights[118784]; // 128x928x1x1
float one5Biases[128]; // 1x128x1x1
float one5Weights[24576]; // 128x192x1x1
float one60Biases[128]; // 1x128x1x1
float one60Weights[122880]; // 128x960x1x1
float one61Biases[128]; // 1x128x1x1
float one61Weights[126976]; // 128x992x1x1
float one6Biases[128]; // 1x128x1x1
float one6Weights[28672]; // 128x224x1x1
float one7Biases[128]; // 1x128x1x1
float one7Weights[32768]; // 128x256x1x1
float one8Biases[128]; // 1x128x1x1
float one8Weights[16384]; // 128x128x1x1
float one9Biases[128]; // 1x128x1x1
float one9Weights[20480]; // 128x160x1x1
float sevenDSBiases[64]; // 1x64x1x1
float sevenDSWeights[9408]; // 64x3x7x7
float three10Biases[32]; // 1x32x1x1
float three10Weights[36864]; // 32x128x3x3
float three11Biases[32]; // 1x32x1x1
float three11Weights[36864]; // 32x128x3x3
float three12Biases[32]; // 1x32x1x1
float three12Weights[36864]; // 32x128x3x3
float three13Biases[32]; // 1x32x1x1
float three13Weights[36864]; // 32x128x3x3
float three14Biases[32]; // 1x32x1x1
float three14Weights[36864]; // 32x128x3x3
float three15Biases[32]; // 1x32x1x1
float three15Weights[36864]; // 32x128x3x3
float three16Biases[32]; // 1x32x1x1
float three16Weights[36864]; // 32x128x3x3
float three17Biases[32]; // 1x32x1x1
float three17Weights[36864]; // 32x128x3x3
float three18Biases[32]; // 1x32x1x1
float three18Weights[36864]; // 32x128x3x3
float three19Biases[32]; // 1x32x1x1
float three19Weights[36864]; // 32x128x3x3
float three1Biases[32]; // 1x32x1x1
float three1Weights[36864]; // 32x128x3x3
float three20Biases[32]; // 1x32x1x1
float three20Weights[36864]; // 32x128x3x3
float three21Biases[32]; // 1x32x1x1
float three21Weights[36864]; // 32x128x3x3
float three22Biases[32]; // 1x32x1x1
float three22Weights[36864]; // 32x128x3x3
float three23Biases[32]; // 1x32x1x1
float three23Weights[36864]; // 32x128x3x3
float three24Biases[32]; // 1x32x1x1
float three24Weights[36864]; // 32x128x3x3
float three25Biases[32]; // 1x32x1x1
float three25Weights[36864]; // 32x128x3x3
float three26Biases[32]; // 1x32x1x1
float three26Weights[36864]; // 32x128x3x3
float three27Biases[32]; // 1x32x1x1
float three27Weights[36864]; // 32x128x3x3
float three28Biases[32]; // 1x32x1x1
float three28Weights[36864]; // 32x128x3x3
float three29Biases[32]; // 1x32x1x1
float three29Weights[36864]; // 32x128x3x3
float three2Biases[32]; // 1x32x1x1
float three2Weights[36864]; // 32x128x3x3
float three30Biases[32]; // 1x32x1x1
float three30Weights[36864]; // 32x128x3x3
float three31Biases[32]; // 1x32x1x1
float three31Weights[36864]; // 32x128x3x3
float three32Biases[32]; // 1x32x1x1
float three32Weights[36864]; // 32x128x3x3
float three33Biases[32]; // 1x32x1x1
float three33Weights[36864]; // 32x128x3x3
float three34Biases[32]; // 1x32x1x1
float three34Weights[36864]; // 32x128x3x3
float three35Biases[32]; // 1x32x1x1
float three35Weights[36864]; // 32x128x3x3
float three36Biases[32]; // 1x32x1x1
float three36Weights[36864]; // 32x128x3x3
float three37Biases[32]; // 1x32x1x1
float three37Weights[36864]; // 32x128x3x3
float three38Biases[32]; // 1x32x1x1
float three38Weights[36864]; // 32x128x3x3
float three39Biases[32]; // 1x32x1x1
float three39Weights[36864]; // 32x128x3x3
float three3Biases[32]; // 1x32x1x1
float three3Weights[36864]; // 32x128x3x3
float three40Biases[32]; // 1x32x1x1
float three40Weights[36864]; // 32x128x3x3
float three41Biases[32]; // 1x32x1x1
float three41Weights[36864]; // 32x128x3x3
float three42Biases[32]; // 1x32x1x1
float three42Weights[36864]; // 32x128x3x3
float three43Biases[32]; // 1x32x1x1
float three43Weights[36864]; // 32x128x3x3
float three44Biases[32]; // 1x32x1x1
float three44Weights[36864]; // 32x128x3x3
float three45Biases[32]; // 1x32x1x1
float three45Weights[36864]; // 32x128x3x3
float three46Biases[32]; // 1x32x1x1
float three46Weights[36864]; // 32x128x3x3
float three47Biases[32]; // 1x32x1x1
float three47Weights[36864]; // 32x128x3x3
float three48Biases[32]; // 1x32x1x1
float three48Weights[36864]; // 32x128x3x3
float three49Biases[32]; // 1x32x1x1
float three49Weights[36864]; // 32x128x3x3
float three4Biases[32]; // 1x32x1x1
float three4Weights[36864]; // 32x128x3x3
float three50Biases[32]; // 1x32x1x1
float three50Weights[36864]; // 32x128x3x3
float three51Biases[32]; // 1x32x1x1
float three51Weights[36864]; // 32x128x3x3
float three52Biases[32]; // 1x32x1x1
float three52Weights[36864]; // 32x128x3x3
float three53Biases[32]; // 1x32x1x1
float three53Weights[36864]; // 32x128x3x3
float three54Biases[32]; // 1x32x1x1
float three54Weights[36864]; // 32x128x3x3
float three55Biases[32]; // 1x32x1x1
float three55Weights[36864]; // 32x128x3x3
float three56Biases[32]; // 1x32x1x1
float three56Weights[36864]; // 32x128x3x3
float three57Biases[32]; // 1x32x1x1
float three57Weights[36864]; // 32x128x3x3
float three58Biases[32]; // 1x32x1x1
float three58Weights[36864]; // 32x128x3x3
float three5Biases[32]; // 1x32x1x1
float three5Weights[36864]; // 32x128x3x3
float three6Biases[32]; // 1x32x1x1
float three6Weights[36864]; // 32x128x3x3
float three7Biases[32]; // 1x32x1x1
float three7Weights[36864]; // 32x128x3x3
float three8Biases[32]; // 1x32x1x1
float three8Weights[36864]; // 32x128x3x3
float three9Biases[32]; // 1x32x1x1
float three9Weights[36864]; // 32x128x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output DenseNet121.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f DenseNet121.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "DenseNet121.h"

// Formats an error message into a freshly malloc'd buffer and returns it.
// The text is "DenseNet121: line <lineNum1>: " followed by format1 expanded
// printf-style with the variadic arguments; the formatted tail is truncated
// (by vsnprintf) to whatever space remains in the buffer.
// Nothing in this function frees the buffer, and the malloc result is not
// checked before it is written to.
static char* DenseNet121Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { capacity = 279 };
    char* text = malloc(capacity);
    // Fixed prefix first; its length says where the caller's text begins.
    int used = sprintf(text, "DenseNet121: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+used, capacity-used, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team ("Threader") types defined below.
typedef struct DenseNet121ThreaderTask1 DenseNet121ThreaderTask1;
// Task callback: called with the task and a pointer to int64_t point coordinates.
typedef void (*DenseNet121ThreaderCallee1)(DenseNet121ThreaderTask1*, int64_t*);
typedef struct DenseNet121ThreaderHub1 DenseNet121ThreaderHub1;
typedef struct DenseNet121ThreaderNode1 DenseNet121ThreaderNode1;
typedef struct DenseNet121ThreaderUnwind1 DenseNet121ThreaderUnwind1;
typedef struct DenseNet121ThreaderTeam1 DenseNet121ThreaderTeam1;

// One unit of parallel work: a callback plus the extents of the index space
// (up to 4 dimensions) over which it is invoked, once per point.
struct DenseNet121ThreaderTask1 {
// Function invoked as callee1(task, point) for every point.
DenseNet121ThreaderCallee1 callee1;
// Untyped pointer; the threader functions in this part of the file never read it
// (presumably context for callee1 -- confirm at the call sites).
void* any1;
// Number of leading hull1 entries in use; DenseNet121ThreaderDo1 returns at once when < 1.
ptrdiff_t nd1;
// Extent of each dimension; the product of the first nd1 entries is the point count.
int64_t hull1[4];
};

// Team-wide coordination state shared by the dispatching thread and the workers.
struct DenseNet121ThreaderHub1 {
// Locked around every access to the fields below in DenseNet121ThreaderMain1
// and DenseNet121ThreaderDo1.
pthread_mutex_t mut1;
// Signalled by the worker that brings pending1 to zero; the dispatcher waits on it.
pthread_cond_t cond1;
// Workers that have not yet finished the current task (set to the thread
// count by the dispatcher, decremented by each worker).
ptrdiff_t pending1;
// Scan cursor into status1: word index ...
ptrdiff_t offset1;
// ... and bit mask of positions still to be examined in that word.
long mask1;
// Bitmap with one bit per node (plus trailing bits); all words are set to -1
// when a task is dispatched and a node's bit is cleared once its points are drained.
long status1[];
};

// Per-thread state: one node per worker thread, aligned to 64 bytes.
struct DenseNet121ThreaderNode1 {
// Held while np1, pt1 and task1 are read or written.
pthread_mutex_t mut2;
// Points still to be run from this node's share; a negative value is the
// exit request written by DenseNet121ThreaderDestroy1.
int64_t np1;
// Coordinates of the next point to hand out from this node's share.
int64_t pt1[4];
// Task posted to this node, or null when nothing is pending.
DenseNet121ThreaderTask1* task1;
// Signalled after task1 is posted; the worker waits on it while task1 is null.
pthread_cond_t cond2;
// Back pointer to the owning team.
DenseNet121ThreaderTeam1* team1;
// Handle of the worker thread created for this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Record of how far team construction got, so that DenseNet121ThreaderDestroy1
// releases exactly the resources that were successfully created.
struct DenseNet121ThreaderUnwind1 {
// Number of leading nodes whose thread was created (to be stopped and joined).
ptrdiff_t join1;
// Number of leading nodes whose condition variable was initialized.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutex was initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub mutex has been initialized.
ptrdiff_t hubMut1;
// Raw (unaligned) malloc result backing the node array; passed to free().
void* nodes1;
// Raw (unaligned) malloc result backing the hub; passed to free().
void* hub1;
};

// A team of worker threads together with its shared hub.
struct DenseNet121ThreaderTeam1 {
// Number of worker threads / nodes.
ptrdiff_t nt1;
// 64-byte-aligned pointer into the hub allocation.
DenseNet121ThreaderHub1* hub2;
// 64-byte-aligned pointer to the array of nt1 nodes.
DenseNet121ThreaderNode1* nodes2;
// Teardown bookkeeping filled in during construction.
DenseNet121ThreaderUnwind1 unwind1;
};

// Advances the point pt2 by one step inside the box described by hull2,
// treating dimension 0 as the fastest-varying digit: a coordinate that would
// reach its extent is reset to 0 and the step carries into the next
// dimension. Only the first nd2 dimensions are touched; if every one of them
// wraps, the point ends up at the origin.
static void DenseNet121ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    for (ptrdiff_t dim = 0; dim < nd2; ++dim) {
        const int64_t next = pt2[dim]+1;
        if (next != hull2[dim]) {
            // No wrap in this dimension: store and stop carrying.
            pt2[dim] = next;
            return;
        }
        pt2[dim] = 0;
    }
}

// Writes val1 into pt3 as mixed-radix digits, using hull3[i] as the radix of
// digit i (dimension 0 is the least significant digit). Digit extraction
// stops once the remaining value is zero or nd3 digits have been written;
// any digits not reached are set to 0. A value too large for the first nd3
// radices has its excess silently dropped.
static void DenseNet121ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t dim = 0;
    while (dim < nd3 && val1) {
        const int64_t radix = hull3[dim];
        pt3[dim] = val1%radix;
        val1 /= radix;
        ++dim;
    }
    while (dim < nd3) {
        pt3[dim] = 0;
        ++dim;
    }
}

// Mixed-radix addition in place: pt4 += plus1 + carry2, digit by digit over
// the first nd4 dimensions with hull4[i] as the radix of digit i. A digit sum
// that reaches its radix has the radix subtracted once and carries 1 into
// the next digit; a carry out of the last digit is discarded.
static void DenseNet121ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t dim = 0; dim < nd4; ++dim) {
        const int64_t radix = hull4[dim];
        const int64_t total = pt4[dim]+plus1[dim]+carry2;
        carry2 = total >= radix;
        pt4[dim] = carry2 ? total-radix : total;
    }
}

// Worker thread entry point; arg1 is this thread's DenseNet121ThreaderNode1.
// Each round: wait on the node's condition variable until a task is posted,
// run the node's own points, then scan the hub's status bitmap for other
// nodes that still have points and help run those, and finally decrement the
// hub's pending count (signalling the hub condition when it reaches zero).
// Returns 0 when a task is posted together with a negative np1.
// Every pthread call is retried in an empty loop until it returns 0.
static void* DenseNet121ThreaderMain1(void* arg1) {
DenseNet121ThreaderNode1* node1 = arg1;
DenseNet121ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
DenseNet121ThreaderHub1* hub3 = team2->hub2;
DenseNet121ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array; also its bit in status1.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// The node mutex is held at the top of every iteration.
DenseNet121ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted: block (the wait releases mut2), then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative count is the exit request set by DenseNet121ThreaderDestroy1.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posted task so the next round waits for a new one.
node1->task1 = 0;
DenseNet121ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Drain this node's own share one point at a time: snapshot the current
// coordinates, advance the shared cursor, and run the callee with the
// node mutex released. np1 is re-read after relocking because it can be
// decremented by other workers helping with this node.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
DenseNet121ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own share is drained: clear this node's bit in the status bitmap.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
// Resume the shared scan from the hub's cursor (word index + bit mask).
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
for (; ; ) {
// Candidate bits in the current word that the mask has not yet excluded.
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// Nothing left in this word: move to the next word with a full mask.
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit index nt2 lies past the last node and is left set by
// DenseNet121ThreaderDo1, so the scan always stops here. The first
// time, restart from word 0 with a full mask; if the end is reached
// again without having helped any node in between, stop scanning.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Keep only the lowest set bit: the node chosen to help.
hand1 &= -hand1;
// Publish the cursor with the chosen bit removed from the mask.
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
DenseNet121ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Same drain loop as above, but on the other node's share.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
DenseNet121ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// The helped node has no points left: clear its bit, then reload the
// shared cursor and allow another wrap-around pass.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// Report completion; the worker that brings the count to zero wakes the
// thread waiting on the hub condition.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-acquire the node mutex before going back to wait for the next task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, fully or partially constructed. A null team is a no-op.
// The counts and flags in team3->unwind1 decide how much is undone:
//   1. every started worker is told to exit (np1 = -1 plus a non-null task
//      pointer, then a signal on its condition variable) and is joined;
//   2. the initialized node condition variables and mutexes are destroyed;
//   3. the hub condition variable and mutex are destroyed if flagged;
//   4. the raw node and hub allocations and the team itself are freed.
// Every pthread call is retried until it returns 0.
static void DenseNet121ThreaderDestroy1(DenseNet121ThreaderTeam1* team3) {
    if (!team3) return;
    DenseNet121ThreaderNode1* const base = team3->nodes2;
    DenseNet121ThreaderNode1* cur;
    DenseNet121ThreaderNode1* end = base+team3->unwind1.join1;
    // Ask all started workers to exit before joining any of them.
    for (cur = base; cur != end; ++cur) {
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        cur->task1 = (DenseNet121ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    for (cur = base; cur != end; ++cur) {
        while (__builtin_expect(pthread_join(cur->thr1, 0), 0)) {}
    }
    end = base+team3->unwind1.nodeConds1;
    for (cur = base; cur != end; ++cur) {
        while (__builtin_expect(pthread_cond_destroy(&cur->cond2), 0)) {}
    }
    end = base+team3->unwind1.nodeMuts1;
    for (cur = base; cur != end; ++cur) {
        while (__builtin_expect(pthread_mutex_destroy(&cur->mut2), 0)) {}
    }
    DenseNet121ThreaderHub1* const hub = team3->hub2;
    if (team3->unwind1.hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (team3->unwind1.hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }
    // Free the raw allocations, not the aligned pointers derived from them.
    free(team3->unwind1.nodes1);
    free(team3->unwind1.hub1);
    free(team3);
}

// Final construction stage: for each of the nt7 nodes, initialize its mutex
// and condition variable and start its worker thread, in that order.
// On the first failure, records in team8->unwind1 how many mutexes, condition
// variables and threads exist so far and returns an error message built by
// DenseNet121Errmsg1 from the pthread return code. Returns 0 on success,
// with all three counts set to nt7.
static char* DenseNet121ThreaderCreate1Up4(DenseNet121ThreaderTeam1* team8, ptrdiff_t nt7) {
    DenseNet121ThreaderNode1* const base = team8->nodes2;
    DenseNet121ThreaderUnwind1* const undo = &team8->unwind1;
    for (ptrdiff_t done = 0; done != nt7; ++done) {
        DenseNet121ThreaderNode1* const cur = base+done;
        const int mutCode = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(mutCode, 0)) {
            // Nothing of this node exists yet.
            char* mutMsg = DenseNet121Errmsg1(__LINE__, "errno %d", mutCode);
            undo->nodeMuts1 = done;
            undo->nodeConds1 = done;
            undo->join1 = done;
            return mutMsg;
        }
        cur->task1 = 0;
        const int condCode = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(condCode, 0)) {
            // This node's mutex exists; its condition variable does not.
            char* condMsg = DenseNet121Errmsg1(__LINE__, "errno %d", condCode);
            undo->nodeMuts1 = done+1;
            undo->nodeConds1 = done;
            undo->join1 = done;
            return condMsg;
        }
        // The worker reads team1 as soon as it starts, so set it first.
        cur->team1 = team8;
        const int thrCode = pthread_create(&cur->thr1, 0, DenseNet121ThreaderMain1, cur);
        if (__builtin_expect(thrCode, 0)) {
            // Mutex and condition variable exist; the thread does not.
            char* thrMsg = DenseNet121Errmsg1(__LINE__, "errno %d", thrCode);
            undo->nodeMuts1 = done+1;
            undo->nodeConds1 = done+1;
            undo->join1 = done;
            return thrMsg;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Construction stage 3: initialize the hub's mutex and condition variable,
// flagging each in team7->unwind1 once it exists, then continue with the
// per-node stage. Returns 0 on success or a DenseNet121Errmsg1 message
// carrying the failing pthread return code.
static char* DenseNet121ThreaderCreate1Up3(DenseNet121ThreaderTeam1* team7, ptrdiff_t nt6) {
    DenseNet121ThreaderHub1* const hub = team7->hub2;
    const int mutCode = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(mutCode, 0)) {
        return DenseNet121Errmsg1(__LINE__, "errno %d", mutCode);
    }
    team7->unwind1.hubMut1 = 1;
    const int condCode = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(condCode, 0)) {
        return DenseNet121Errmsg1(__LINE__, "errno %d", condCode);
    }
    team7->unwind1.hubCond1 = 1;
    return DenseNet121ThreaderCreate1Up4(team7, nt6);
}

// Construction stage 2: allocate the node array (nt5 nodes) with 63 spare
// bytes so the array can start on a 64-byte boundary. The raw pointer is kept
// in team6->unwind1.nodes1 and the aligned one in team6->nodes2.
// Returns an error message if the byte count overflows size_t
// ("too many threads") or if malloc fails; otherwise continues with the
// next stage and returns its result.
static char* DenseNet121ThreaderCreate1Up2(DenseNet121ThreaderTeam1* team6, ptrdiff_t nt5) {
    const size_t nodeBytes = sizeof(DenseNet121ThreaderNode1);
    const size_t total = nt5*nodeBytes;
    // Dividing back detects wrap-around in the multiplication.
    if (__builtin_expect(total/nodeBytes != (size_t)nt5, 0)) {
        return DenseNet121Errmsg1(__LINE__, "too many threads");
    }
    void* raw = malloc(total+63);
    if (__builtin_expect(raw == 0, 0)) {
        return DenseNet121Errmsg1(__LINE__, "errno %d", errno);
    }
    team6->unwind1.nodes1 = raw;
    team6->nodes2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return DenseNet121ThreaderCreate1Up3(team6, nt5);
}

// Construction stage 1: record the thread count and allocate the hub.
// The hub is sized for its header plus nt4/(bits per long)+1 status words,
// rounded up to a multiple of 64 bytes, with 63 spare bytes so it can start
// on a 64-byte boundary. The raw pointer is kept in team5->unwind1.hub1 and
// the aligned one in team5->hub2. Returns an error message if malloc fails;
// otherwise continues with the next stage and returns its result.
static char* DenseNet121ThreaderCreate1Up1(DenseNet121ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;
    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(DenseNet121ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;
    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return DenseNet121Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return DenseNet121ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
// On success stores the new team in *team4 and returns 0. On failure returns
// an error message from DenseNet121Errmsg1 and leaves *team4 untouched; a
// partially built team is torn down with DenseNet121ThreaderDestroy1.
// Fails with "too few threads" when nt3 < 1.
static char* DenseNet121ThreaderCreate1(DenseNet121ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return DenseNet121Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind counters start at "nothing to undo".
    DenseNet121ThreaderTeam1* fresh = calloc(1, sizeof(DenseNet121ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return DenseNet121Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = DenseNet121ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure != 0, 0)) {
        DenseNet121ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return 0;
}

// Copies the pthread_t of worker number idx1 into *thr2.
// Returns 0 on success, or a "bad thread idx" error message when idx1 is
// outside [0, team9->nt1); *thr2 is not written in that case.
static char* DenseNet121ThreaderPthreadT1(
    pthread_t* thr2,
    DenseNet121ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    const int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return DenseNet121Errmsg1(__LINE__, "bad thread idx");
    }
    const DenseNet121ThreaderNode1* const node = team9->nodes2+idx1;
    *thr2 = node->thr1;
    return 0;
}

// Runs task3 on team10 and blocks until every worker has reported completion.
// The task's index space (the product of hull1[0..nd1)) is cut into one
// contiguous run of points per node; each node is given its starting
// coordinates and point count and is then woken. Does nothing when nd1 < 1.
// Every pthread call is retried in an empty loop until it returns 0.
static void DenseNet121ThreaderDo1(DenseNet121ThreaderTeam1* team10, DenseNet121ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points = product of the extents in use.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every node gets each1 points; the first more1 nodes get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// each1 expressed as coordinates, used as the per-node stride.
int64_t plus2[4];
DenseNet121ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting coordinates of the current node's run; the first run starts at the origin.
int64_t pt6[4] = {0};
DenseNet121ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held for the whole setup, so a worker that finishes early
// cannot touch the hub state until the wait below releases the mutex.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
DenseNet121ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 for the nodes that take an extra point, else 0.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start by this node's share (each1 + carry3) for the next node.
DenseNet121ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan cursor and mark every status bit as "may still have points".
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Wait until every worker has decremented pending1; the loop re-checks the
// count after each wake-up.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

// Approximates exp(x) for 16 packed single-precision lanes.
// Steps: clamp x, form t = x*log2(e) and r = round-to-nearest(t), reduce to
// f = x - r*ln(2), evaluate a degree-4 polynomial in f by Horner's rule, then
// scale by a power of two by adding an integer to the exponent bits of the
// polynomial result.
static __m512 DenseNet121Exp1(__m512 x1) {
// Clamp the argument; the bounds are approximately ln(FLT_MIN) and ln(FLT_MAX).
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
// t = x * 1.442695 (approximately log2(e)); r = t rounded to the nearest integer.
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
// f = x - r*ln(2), with ln(2) applied as two constants (a leading part and
// a small correction) in successive fused multiply-adds.
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
// Horner evaluation of the polynomial in f, highest coefficient first.
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
// Convert t to an integer and shift it past the 23 mantissa bits so that the
// integer add below lands in the float exponent field (multiplies g by 2^n).
// NOTE(review): the integer comes from t1 via cvtps_epi32 rather than from r1;
// the two should agree under the default round-to-nearest mode -- confirm
// that no caller changes the MXCSR rounding mode.
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

static void DenseNet121Softmax1(DenseNet121ThreaderTeam1* team224, char** tensors421) {
(void)team224;
char*restrict ptr12 = tensors421[0];
char*restrict ptr13 = tensors421[1];
__m512 max1 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*0);
__m512 max2 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*1);
__m512 max3 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*2);
__m512 max4 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*3);
__m512 max5 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*4);
__m512 max6 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*5);
__m512 max7 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*6);
__m512 max8 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*7);
__m512 max9 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*8);
__m512 max10 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*9);
__m512 max11 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*10);
__m512 max12 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*11);
__m512 max13 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*12);
__m512 max14 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*13);
__m512 max15 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*14);
__m512 max16 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*15);
for (ptrdiff_t i251 = 1; i251 <= 2; ++i251) {
__m512 dat2433 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i251);
__m512 dat2434 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i251);
__m512 dat2435 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i251);
__m512 dat2436 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i251);
__m512 dat2437 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i251);
__m512 dat2438 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i251);
__m512 dat2439 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i251);
__m512 dat2440 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i251);
__m512 dat2441 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i251);
__m512 dat2442 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i251);
__m512 dat2443 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i251);
__m512 dat2444 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i251);
__m512 dat2445 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i251);
__m512 dat2446 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i251);
__m512 dat2447 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i251);
__m512 dat2448 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i251);
max1 = _mm512_max_ps(max1, dat2433);
max2 = _mm512_max_ps(max2, dat2434);
max3 = _mm512_max_ps(max3, dat2435);
max4 = _mm512_max_ps(max4, dat2436);
max5 = _mm512_max_ps(max5, dat2437);
max6 = _mm512_max_ps(max6, dat2438);
max7 = _mm512_max_ps(max7, dat2439);
max8 = _mm512_max_ps(max8, dat2440);
max9 = _mm512_max_ps(max9, dat2441);
max10 = _mm512_max_ps(max10, dat2442);
max11 = _mm512_max_ps(max11, dat2443);
max12 = _mm512_max_ps(max12, dat2444);
max13 = _mm512_max_ps(max13, dat2445);
max14 = _mm512_max_ps(max14, dat2446);
max15 = _mm512_max_ps(max15, dat2447);
max16 = _mm512_max_ps(max16, dat2448);
}
__m512 dat2449 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
__m512 dat2450 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2451 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2452 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2453 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2454 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2455 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2456 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2457 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2458 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2459 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2460 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2461 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2462 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
max1 = _mm512_max_ps(max1, dat2449);
max2 = _mm512_max_ps(max2, dat2450);
max3 = _mm512_max_ps(max3, dat2451);
max4 = _mm512_max_ps(max4, dat2452);
max5 = _mm512_max_ps(max5, dat2453);
max6 = _mm512_max_ps(max6, dat2454);
max7 = _mm512_max_ps(max7, dat2455);
max8 = _mm512_max_ps(max8, dat2456);
max9 = _mm512_max_ps(max9, dat2457);
max10 = _mm512_max_ps(max10, dat2458);
max11 = _mm512_max_ps(max11, dat2459);
max12 = _mm512_max_ps(max12, dat2460);
max13 = _mm512_max_ps(max13, dat2461);
max14 = _mm512_max_ps(max14, dat2462);
__m512 dat2463 = _mm512_maskz_loadu_ps(255, ptr12+(ptrdiff_t)64*62);
max16 = _mm512_mask_max_ps(max16, 255, max16, dat2463);
max1 = _mm512_max_ps(max1, max9);
max2 = _mm512_max_ps(max2, max10);
max3 = _mm512_max_ps(max3, max11);
max4 = _mm512_max_ps(max4, max12);
max5 = _mm512_max_ps(max5, max13);
max6 = _mm512_max_ps(max6, max14);
max7 = _mm512_max_ps(max7, max15);
max8 = _mm512_max_ps(max8, max16);
max1 = _mm512_max_ps(max1, max5);
max2 = _mm512_max_ps(max2, max6);
max3 = _mm512_max_ps(max3, max7);
max4 = _mm512_max_ps(max4, max8);
max1 = _mm512_max_ps(max1, max3);
max2 = _mm512_max_ps(max2, max4);
max1 = _mm512_max_ps(max1, max2);
__m512i p2 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
max1 = _mm512_mask_max_ps(max1, 255, max1, _mm512_permutexvar_ps(p2, max1));
__m512i p3 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
max1 = _mm512_mask_max_ps(max1, 15, max1, _mm512_permutexvar_ps(p3, max1));
__m512i p4 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
max1 = _mm512_mask_max_ps(max1, 3, max1, _mm512_permutexvar_ps(p4, max1));
__m512i p5 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
max1 = _mm512_mask_max_ps(max1, 1, max1, _mm512_permutexvar_ps(p5, max1));
__m512i p6 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
max1 = _mm512_permutexvar_ps(p6, max1);
__m512 sum2595 = _mm512_setzero_ps();
__m512 neg1 = _mm512_sub_ps(sum2595, max1);
__m512 dat2494 = _mm512_maskz_loadu_ps(255, ptr12+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3);
__m512 dat2493 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
__m512 dat2492 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2491 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2490 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2489 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2488 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2487 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2486 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2485 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2484 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2483 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2482 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2481 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2480 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
dat2494 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2494));
sum2595 = _mm512_mask_add_ps(sum2595, 255, sum2595, dat2494);
dat2493 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2493));
sum2595 = _mm512_add_ps(sum2595, dat2493);
dat2492 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2492));
sum2595 = _mm512_add_ps(sum2595, dat2492);
dat2491 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2491));
sum2595 = _mm512_add_ps(sum2595, dat2491);
dat2490 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2490));
sum2595 = _mm512_add_ps(sum2595, dat2490);
dat2489 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2489));
sum2595 = _mm512_add_ps(sum2595, dat2489);
dat2488 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2488));
sum2595 = _mm512_add_ps(sum2595, dat2488);
dat2487 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2487));
sum2595 = _mm512_add_ps(sum2595, dat2487);
dat2486 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2486));
sum2595 = _mm512_add_ps(sum2595, dat2486);
dat2485 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2485));
sum2595 = _mm512_add_ps(sum2595, dat2485);
dat2484 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2484));
sum2595 = _mm512_add_ps(sum2595, dat2484);
dat2483 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2483));
sum2595 = _mm512_add_ps(sum2595, dat2483);
dat2482 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2482));
sum2595 = _mm512_add_ps(sum2595, dat2482);
dat2481 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2481));
sum2595 = _mm512_add_ps(sum2595, dat2481);
dat2480 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2480));
sum2595 = _mm512_add_ps(sum2595, dat2480);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3, 255, dat2494);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3, 65535, dat2493);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3, 65535, dat2492);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3, 65535, dat2491);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3, 65535, dat2490);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3, 65535, dat2489);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3, 65535, dat2488);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3, 65535, dat2487);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3, 65535, dat2486);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3, 65535, dat2485);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3, 65535, dat2484);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3, 65535, dat2483);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3, 65535, dat2482);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3, 65535, dat2481);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3, 65535, dat2480);
for (ptrdiff_t i252 = 2; i252 >= 0; --i252) {
__m512 dat2479 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i252);
__m512 dat2478 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i252);
__m512 dat2477 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i252);
__m512 dat2476 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i252);
__m512 dat2475 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i252);
__m512 dat2474 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i252);
__m512 dat2473 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i252);
__m512 dat2472 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i252);
__m512 dat2471 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i252);
__m512 dat2470 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i252);
__m512 dat2469 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i252);
__m512 dat2468 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i252);
__m512 dat2467 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i252);
__m512 dat2466 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i252);
__m512 dat2465 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i252);
__m512 dat2464 = _mm512_maskz_loadu_ps(65535, ptr12+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i252);
dat2479 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2479));
sum2595 = _mm512_add_ps(sum2595, dat2479);
dat2478 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2478));
sum2595 = _mm512_add_ps(sum2595, dat2478);
dat2477 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2477));
sum2595 = _mm512_add_ps(sum2595, dat2477);
dat2476 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2476));
sum2595 = _mm512_add_ps(sum2595, dat2476);
dat2475 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2475));
sum2595 = _mm512_add_ps(sum2595, dat2475);
dat2474 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2474));
sum2595 = _mm512_add_ps(sum2595, dat2474);
dat2473 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2473));
sum2595 = _mm512_add_ps(sum2595, dat2473);
dat2472 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2472));
sum2595 = _mm512_add_ps(sum2595, dat2472);
dat2471 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2471));
sum2595 = _mm512_add_ps(sum2595, dat2471);
dat2470 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2470));
sum2595 = _mm512_add_ps(sum2595, dat2470);
dat2469 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2469));
sum2595 = _mm512_add_ps(sum2595, dat2469);
dat2468 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2468));
sum2595 = _mm512_add_ps(sum2595, dat2468);
dat2467 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2467));
sum2595 = _mm512_add_ps(sum2595, dat2467);
dat2466 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2466));
sum2595 = _mm512_add_ps(sum2595, dat2466);
dat2465 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2465));
sum2595 = _mm512_add_ps(sum2595, dat2465);
dat2464 = DenseNet121Exp1(_mm512_add_ps(neg1, dat2464));
sum2595 = _mm512_add_ps(sum2595, dat2464);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i252, 65535, dat2479);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i252, 65535, dat2478);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i252, 65535, dat2477);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i252, 65535, dat2476);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i252, 65535, dat2475);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i252, 65535, dat2474);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i252, 65535, dat2473);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i252, 65535, dat2472);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i252, 65535, dat2471);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i252, 65535, dat2470);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i252, 65535, dat2469);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i252, 65535, dat2468);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i252, 65535, dat2467);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i252, 65535, dat2466);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i252, 65535, dat2465);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i252, 65535, dat2464);
}
__m512i p7 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
sum2595 = _mm512_mask_add_ps(sum2595, 255, sum2595, _mm512_permutexvar_ps(p7, sum2595));
__m512i p8 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
sum2595 = _mm512_mask_add_ps(sum2595, 15, sum2595, _mm512_permutexvar_ps(p8, sum2595));
__m512i p9 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
sum2595 = _mm512_mask_add_ps(sum2595, 3, sum2595, _mm512_permutexvar_ps(p9, sum2595));
__m512i p10 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
sum2595 = _mm512_mask_add_ps(sum2595, 1, sum2595, _mm512_permutexvar_ps(p10, sum2595));
__m512i p11 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
sum2595 = _mm512_permutexvar_ps(p11, sum2595);
__m512 rcp215 = _mm512_div_ps(_mm512_set1_ps(1e+00f), sum2595);
for (ptrdiff_t i253 = 0; i253 < 62; ++i253) {
__m512 dat2495 = _mm512_maskz_loadu_ps(65535, ptr13+(ptrdiff_t)64*i253);
dat2495 = _mm512_mul_ps(rcp215, dat2495);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*i253, 65535, dat2495);
}
__m512 dat2496 = _mm512_maskz_loadu_ps(255, ptr13+(ptrdiff_t)64*62);
dat2496 = _mm512_mul_ps(rcp215, dat2496);
_mm512_mask_storeu_ps(ptr13+(ptrdiff_t)64*62, 255, dat2496);
}

static __m512 DenseNet121Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void DenseNet121BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(1e-05f);
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va1 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0);
__m512 va2 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*1);
__m512 va3 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*2);
__m512 va4 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*3);
__m512 rcp1 = DenseNet121Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 rcp2 = DenseNet121Rsqrt1(_mm512_add_ps(eps1, va2));
__m512 rcp3 = DenseNet121Rsqrt1(_mm512_add_ps(eps1, va3));
__m512 rcp4 = DenseNet121Rsqrt1(_mm512_add_ps(eps1, va4));
__m512 sc1 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0);
__m512 sc2 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*1);
__m512 sc3 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*2);
__m512 sc4 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*3);
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 me1 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0);
__m512 me2 = _mm512_loadu_ps(means1+(ptrdiff_t)16*1);
__m512 me3 = _mm512_loadu_ps(means1+(ptrdiff_t)16*2);
__m512 me4 = _mm512_loadu_ps(means1+(ptrdiff_t)16*3);
__m512 sh1 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0);
__m512 sh2 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*1);
__m512 sh3 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*2);
__m512 sh4 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*3);
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo1, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo1, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo1, add4);
__m512 hi1 = _mm512_permutex2var_ps(mul1, xhi1, add1);
__m512 hi2 = _mm512_permutex2var_ps(mul2, xhi1, add2);
__m512 hi3 = _mm512_permutex2var_ps(mul3, xhi1, add3);
__m512 hi4 = _mm512_permutex2var_ps(mul4, xhi1, add4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0, lo1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1, hi1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*2, lo2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*3, hi2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*4, lo3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*5, hi3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*6, lo4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*7, hi4);
}

static void DenseNet121BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas3
) {
    // Folds batch-norm parameters into one multiplier and one addend per
    // element:
    //   mul = scale * rsqrt(variance + 1e-5)
    //   add = shift - mean * mul
    // Processes 8 vectors of 16 floats (128 elements) from each input array
    // and writes 1024 bytes to mas3 as interleaved (mul, add) pairs.
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Permute indices >= 16 pull from the addend vector; the rest pull from
    // the multiplier vector.
    const __m512i pairsLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairsHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    const ptrdiff_t vectorCount = 8;
    ptrdiff_t v = 0;
    while (v < vectorCount) {
        const ptrdiff_t inOff = (ptrdiff_t)16*v;
        const ptrdiff_t outOff = (ptrdiff_t)128*v;
        __m512 mulVec = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, _mm512_loadu_ps(variances2+inOff)));
        mulVec = _mm512_mul_ps(mulVec, _mm512_loadu_ps(scales2+inOff));
        const __m512 addVec = _mm512_fnmadd_ps(_mm512_loadu_ps(means2+inOff), mulVec, _mm512_loadu_ps(shifts2+inOff));
        // First 64 bytes: pairs for lanes 0..7; next 64 bytes: lanes 8..15.
        _mm512_storeu_ps(mas3+outOff, _mm512_permutex2var_ps(mulVec, pairsLo, addVec));
        _mm512_storeu_ps(mas3+outOff+(ptrdiff_t)64, _mm512_permutex2var_ps(mulVec, pairsHi, addVec));
        ++v;
    }
}

static void DenseNet121BnSimplify3(
float*restrict means3,
float*restrict variances3,
float*restrict scales3,
float*restrict shifts3,
char*restrict mas4
) {
__m512 eps3 = _mm512_set1_ps(1e-05f);
__m512i xlo3 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi3 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i19 = 0; i19 < 1; ++i19) {
__m512 va13 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*i19);
__m512 va14 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*1+(ptrdiff_t)80*i19);
__m512 va15 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*2+(ptrdiff_t)80*i19);
__m512 va16 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*3+(ptrdiff_t)80*i19);
__m512 va17 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*4+(ptrdiff_t)80*i19);
__m512 rcp13 = DenseNet121Rsqrt1(_mm512_add_ps(eps3, va13));
__m512 rcp14 = DenseNet121Rsqrt1(_mm512_add_ps(eps3, va14));
__m512 rcp15 = DenseNet121Rsqrt1(_mm512_add_ps(eps3, va15));
__m512 rcp16 = DenseNet121Rsqrt1(_mm512_add_ps(eps3, va16));
__m512 rcp17 = DenseNet121Rsqrt1(_mm512_add_ps(eps3, va17));
__m512 sc13 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*i19);
__m512 sc14 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*1+(ptrdiff_t)80*i19);
__m512 sc15 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*2+(ptrdiff_t)80*i19);
__m512 sc16 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*3+(ptrdiff_t)80*i19);
__m512 sc17 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*4+(ptrdiff_t)80*i19);
__m512 mul13 = _mm512_mul_ps(rcp13, sc13);
__m512 mul14 = _mm512_mul_ps(rcp14, sc14);
__m512 mul15 = _mm512_mul_ps(rcp15, sc15);
__m512 mul16 = _mm512_mul_ps(rcp16, sc16);
__m512 mul17 = _mm512_mul_ps(rcp17, sc17);
__m512 me13 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*i19);
__m512 me14 = _mm512_loadu_ps(means3+(ptrdiff_t)16*1+(ptrdiff_t)80*i19);
__m512 me15 = _mm512_loadu_ps(means3+(ptrdiff_t)16*2+(ptrdiff_t)80*i19);
__m512 me16 = _mm512_loadu_ps(means3+(ptrdiff_t)16*3+(ptrdiff_t)80*i19);
__m512 me17 = _mm512_loadu_ps(means3+(ptrdiff_t)16*4+(ptrdiff_t)80*i19);
__m512 sh13 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*i19);
__m512 sh14 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*1+(ptrdiff_t)80*i19);
__m512 sh15 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*2+(ptrdiff_t)80*i19);
__m512 sh16 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*3+(ptrdiff_t)80*i19);
__m512 sh17 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*4+(ptrdiff_t)80*i19);
__m512 add13 = _mm512_fnmadd_ps(me13, mul13, sh13);
__m512 add14 = _mm512_fnmadd_ps(me14, mul14, sh14);
__m512 add15 = _mm512_fnmadd_ps(me15, mul15, sh15);
__m512 add16 = _mm512_fnmadd_ps(me16, mul16, sh16);
__m512 add17 = _mm512_fnmadd_ps(me17, mul17, sh17);
__m512 lo13 = _mm512_permutex2var_ps(mul13, xlo3, add13);
__m512 lo14 = _mm512_permutex2var_ps(mul14, xlo3, add14);
__m512 lo15 = _mm512_permutex2var_ps(mul15, xlo3, add15);
__m512 lo16 = _mm512_permutex2var_ps(mul16, xlo3, add16);
__m512 lo17 = _mm512_permutex2var_ps(mul17, xlo3, add17);
__m512 hi13 = _mm512_permutex2var_ps(mul13, xhi3, add13);
__m512 hi14 = _mm512_permutex2var_ps(mul14, xhi3, add14);
__m512 hi15 = _mm512_permutex2var_ps(mul15, xhi3, add15);
__m512 hi16 = _mm512_permutex2var_ps(mul16, xhi3, add16);
__m512 hi17 = _mm512_permutex2var_ps(mul17, xhi3, add17);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*0+(ptrdiff_t)640*i19, lo13);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*1+(ptrdiff_t)640*i19, hi13);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*2+(ptrdiff_t)640*i19, lo14);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*3+(ptrdiff_t)640*i19, hi14);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*4+(ptrdiff_t)640*i19, lo15);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*5+(ptrdiff_t)640*i19, hi15);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*6+(ptrdiff_t)640*i19, lo16);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*7+(ptrdiff_t)640*i19, hi16);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*8+(ptrdiff_t)640*i19, lo17);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*9+(ptrdiff_t)640*i19, hi17);
}
__m512 va18 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 rcp18 = DenseNet121Rsqrt1(_mm512_add_ps(eps3, va18));
__m512 sc18 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 mul18 = _mm512_mul_ps(rcp18, sc18);
__m512 me18 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sh18 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 add18 = _mm512_fnmadd_ps(me18, mul18, sh18);
__m512 lo18 = _mm512_permutex2var_ps(mul18, xlo3, add18);
__m512 hi18 = _mm512_permutex2var_ps(mul18, xhi3, add18);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*0+(ptrdiff_t)640*1, lo18);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*1+(ptrdiff_t)640*1, hi18);
}

static void DenseNet121BnSimplify4(
float*restrict means4,
float*restrict variances4,
float*restrict scales4,
float*restrict shifts4,
char*restrict mas5
) {
__m512 eps4 = _mm512_set1_ps(1e-05f);
__m512i xlo4 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi4 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i26 = 0; i26 < 2; ++i26) {
__m512 va19 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*i26);
__m512 va20 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*i26);
__m512 va21 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*i26);
__m512 va22 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*3+(ptrdiff_t)80*i26);
__m512 va23 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*4+(ptrdiff_t)80*i26);
__m512 rcp19 = DenseNet121Rsqrt1(_mm512_add_ps(eps4, va19));
__m512 rcp20 = DenseNet121Rsqrt1(_mm512_add_ps(eps4, va20));
__m512 rcp21 = DenseNet121Rsqrt1(_mm512_add_ps(eps4, va21));
__m512 rcp22 = DenseNet121Rsqrt1(_mm512_add_ps(eps4, va22));
__m512 rcp23 = DenseNet121Rsqrt1(_mm512_add_ps(eps4, va23));
__m512 sc19 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*i26);
__m512 sc20 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*i26);
__m512 sc21 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*i26);
__m512 sc22 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*3+(ptrdiff_t)80*i26);
__m512 sc23 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*4+(ptrdiff_t)80*i26);
__m512 mul19 = _mm512_mul_ps(rcp19, sc19);
__m512 mul20 = _mm512_mul_ps(rcp20, sc20);
__m512 mul21 = _mm512_mul_ps(rcp21, sc21);
__m512 mul22 = _mm512_mul_ps(rcp22, sc22);
__m512 mul23 = _mm512_mul_ps(rcp23, sc23);
__m512 me19 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*i26);
__m512 me20 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*i26);
__m512 me21 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*i26);
__m512 me22 = _mm512_loadu_ps(means4+(ptrdiff_t)16*3+(ptrdiff_t)80*i26);
__m512 me23 = _mm512_loadu_ps(means4+(ptrdiff_t)16*4+(ptrdiff_t)80*i26);
__m512 sh19 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*i26);
__m512 sh20 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*i26);
__m512 sh21 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*i26);
__m512 sh22 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*3+(ptrdiff_t)80*i26);
__m512 sh23 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*4+(ptrdiff_t)80*i26);
__m512 add19 = _mm512_fnmadd_ps(me19, mul19, sh19);
__m512 add20 = _mm512_fnmadd_ps(me20, mul20, sh20);
__m512 add21 = _mm512_fnmadd_ps(me21, mul21, sh21);
__m512 add22 = _mm512_fnmadd_ps(me22, mul22, sh22);
__m512 add23 = _mm512_fnmadd_ps(me23, mul23, sh23);
__m512 lo19 = _mm512_permutex2var_ps(mul19, xlo4, add19);
__m512 lo20 = _mm512_permutex2var_ps(mul20, xlo4, add20);
__m512 lo21 = _mm512_permutex2var_ps(mul21, xlo4, add21);
__m512 lo22 = _mm512_permutex2var_ps(mul22, xlo4, add22);
__m512 lo23 = _mm512_permutex2var_ps(mul23, xlo4, add23);
__m512 hi19 = _mm512_permutex2var_ps(mul19, xhi4, add19);
__m512 hi20 = _mm512_permutex2var_ps(mul20, xhi4, add20);
__m512 hi21 = _mm512_permutex2var_ps(mul21, xhi4, add21);
__m512 hi22 = _mm512_permutex2var_ps(mul22, xhi4, add22);
__m512 hi23 = _mm512_permutex2var_ps(mul23, xhi4, add23);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*0+(ptrdiff_t)640*i26, lo19);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*1+(ptrdiff_t)640*i26, hi19);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*2+(ptrdiff_t)640*i26, lo20);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*3+(ptrdiff_t)640*i26, hi20);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*4+(ptrdiff_t)640*i26, lo21);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*5+(ptrdiff_t)640*i26, hi21);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*6+(ptrdiff_t)640*i26, lo22);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*7+(ptrdiff_t)640*i26, hi22);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*8+(ptrdiff_t)640*i26, lo23);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*9+(ptrdiff_t)640*i26, hi23);
}
}

static void DenseNet121BnSimplify5(
float*restrict means5,
float*restrict variances5,
float*restrict scales5,
float*restrict shifts5,
char*restrict mas6
) {
__m512 eps5 = _mm512_set1_ps(1e-05f);
__m512i xlo5 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi5 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i30 = 0; i30 < 2; ++i30) {
__m512 va24 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 va25 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 va26 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 va27 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 va28 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 rcp24 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va24));
__m512 rcp25 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va25));
__m512 rcp26 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va26));
__m512 rcp27 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va27));
__m512 rcp28 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va28));
__m512 sc24 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sc25 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sc26 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sc27 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sc28 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 mul24 = _mm512_mul_ps(rcp24, sc24);
__m512 mul25 = _mm512_mul_ps(rcp25, sc25);
__m512 mul26 = _mm512_mul_ps(rcp26, sc26);
__m512 mul27 = _mm512_mul_ps(rcp27, sc27);
__m512 mul28 = _mm512_mul_ps(rcp28, sc28);
__m512 me24 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 me25 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 me26 = _mm512_loadu_ps(means5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 me27 = _mm512_loadu_ps(means5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 me28 = _mm512_loadu_ps(means5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 sh24 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sh25 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sh26 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sh27 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sh28 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 add24 = _mm512_fnmadd_ps(me24, mul24, sh24);
__m512 add25 = _mm512_fnmadd_ps(me25, mul25, sh25);
__m512 add26 = _mm512_fnmadd_ps(me26, mul26, sh26);
__m512 add27 = _mm512_fnmadd_ps(me27, mul27, sh27);
__m512 add28 = _mm512_fnmadd_ps(me28, mul28, sh28);
__m512 lo24 = _mm512_permutex2var_ps(mul24, xlo5, add24);
__m512 lo25 = _mm512_permutex2var_ps(mul25, xlo5, add25);
__m512 lo26 = _mm512_permutex2var_ps(mul26, xlo5, add26);
__m512 lo27 = _mm512_permutex2var_ps(mul27, xlo5, add27);
__m512 lo28 = _mm512_permutex2var_ps(mul28, xlo5, add28);
__m512 hi24 = _mm512_permutex2var_ps(mul24, xhi5, add24);
__m512 hi25 = _mm512_permutex2var_ps(mul25, xhi5, add25);
__m512 hi26 = _mm512_permutex2var_ps(mul26, xhi5, add26);
__m512 hi27 = _mm512_permutex2var_ps(mul27, xhi5, add27);
__m512 hi28 = _mm512_permutex2var_ps(mul28, xhi5, add28);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*0+(ptrdiff_t)640*i30, lo24);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*1+(ptrdiff_t)640*i30, hi24);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*2+(ptrdiff_t)640*i30, lo25);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*3+(ptrdiff_t)640*i30, hi25);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*4+(ptrdiff_t)640*i30, lo26);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*5+(ptrdiff_t)640*i30, hi26);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*6+(ptrdiff_t)640*i30, lo27);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*7+(ptrdiff_t)640*i30, hi27);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*8+(ptrdiff_t)640*i30, lo28);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*9+(ptrdiff_t)640*i30, hi28);
}
__m512 va29 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 va30 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 rcp29 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va29));
__m512 rcp30 = DenseNet121Rsqrt1(_mm512_add_ps(eps5, va30));
__m512 sc29 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 sc30 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 mul29 = _mm512_mul_ps(rcp29, sc29);
__m512 mul30 = _mm512_mul_ps(rcp30, sc30);
__m512 me29 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 me30 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 sh29 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 sh30 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 add29 = _mm512_fnmadd_ps(me29, mul29, sh29);
__m512 add30 = _mm512_fnmadd_ps(me30, mul30, sh30);
__m512 lo29 = _mm512_permutex2var_ps(mul29, xlo5, add29);
__m512 lo30 = _mm512_permutex2var_ps(mul30, xlo5, add30);
__m512 hi29 = _mm512_permutex2var_ps(mul29, xhi5, add29);
__m512 hi30 = _mm512_permutex2var_ps(mul30, xhi5, add30);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*0+(ptrdiff_t)640*2, lo29);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*1+(ptrdiff_t)640*2, hi29);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*2+(ptrdiff_t)640*2, lo30);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*3+(ptrdiff_t)640*2, hi30);
}

static void DenseNet121BnSimplify6(
float*restrict means6,
float*restrict variances6,
float*restrict scales6,
float*restrict shifts6,
char*restrict mas7
) {
__m512 eps6 = _mm512_set1_ps(1e-05f);
__m512i xlo6 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi6 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i34 = 0; i34 < 2; ++i34) {
__m512 va31 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*i34);
__m512 va32 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*i34);
__m512 va33 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*i34);
__m512 va34 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*3+(ptrdiff_t)80*i34);
__m512 va35 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*4+(ptrdiff_t)80*i34);
__m512 rcp31 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va31));
__m512 rcp32 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va32));
__m512 rcp33 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va33));
__m512 rcp34 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va34));
__m512 rcp35 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va35));
__m512 sc31 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*i34);
__m512 sc32 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*i34);
__m512 sc33 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*i34);
__m512 sc34 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*3+(ptrdiff_t)80*i34);
__m512 sc35 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*4+(ptrdiff_t)80*i34);
__m512 mul31 = _mm512_mul_ps(rcp31, sc31);
__m512 mul32 = _mm512_mul_ps(rcp32, sc32);
__m512 mul33 = _mm512_mul_ps(rcp33, sc33);
__m512 mul34 = _mm512_mul_ps(rcp34, sc34);
__m512 mul35 = _mm512_mul_ps(rcp35, sc35);
__m512 me31 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*i34);
__m512 me32 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*i34);
__m512 me33 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*i34);
__m512 me34 = _mm512_loadu_ps(means6+(ptrdiff_t)16*3+(ptrdiff_t)80*i34);
__m512 me35 = _mm512_loadu_ps(means6+(ptrdiff_t)16*4+(ptrdiff_t)80*i34);
__m512 sh31 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*i34);
__m512 sh32 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*i34);
__m512 sh33 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*i34);
__m512 sh34 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*3+(ptrdiff_t)80*i34);
__m512 sh35 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*4+(ptrdiff_t)80*i34);
__m512 add31 = _mm512_fnmadd_ps(me31, mul31, sh31);
__m512 add32 = _mm512_fnmadd_ps(me32, mul32, sh32);
__m512 add33 = _mm512_fnmadd_ps(me33, mul33, sh33);
__m512 add34 = _mm512_fnmadd_ps(me34, mul34, sh34);
__m512 add35 = _mm512_fnmadd_ps(me35, mul35, sh35);
__m512 lo31 = _mm512_permutex2var_ps(mul31, xlo6, add31);
__m512 lo32 = _mm512_permutex2var_ps(mul32, xlo6, add32);
__m512 lo33 = _mm512_permutex2var_ps(mul33, xlo6, add33);
__m512 lo34 = _mm512_permutex2var_ps(mul34, xlo6, add34);
__m512 lo35 = _mm512_permutex2var_ps(mul35, xlo6, add35);
__m512 hi31 = _mm512_permutex2var_ps(mul31, xhi6, add31);
__m512 hi32 = _mm512_permutex2var_ps(mul32, xhi6, add32);
__m512 hi33 = _mm512_permutex2var_ps(mul33, xhi6, add33);
__m512 hi34 = _mm512_permutex2var_ps(mul34, xhi6, add34);
__m512 hi35 = _mm512_permutex2var_ps(mul35, xhi6, add35);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*0+(ptrdiff_t)640*i34, lo31);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*1+(ptrdiff_t)640*i34, hi31);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*2+(ptrdiff_t)640*i34, lo32);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*3+(ptrdiff_t)640*i34, hi32);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*4+(ptrdiff_t)640*i34, lo33);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*5+(ptrdiff_t)640*i34, hi33);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*6+(ptrdiff_t)640*i34, lo34);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*7+(ptrdiff_t)640*i34, hi34);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*8+(ptrdiff_t)640*i34, lo35);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*9+(ptrdiff_t)640*i34, hi35);
}
__m512 va36 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 va37 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 va38 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*2);
__m512 va39 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*3+(ptrdiff_t)80*2);
__m512 rcp36 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va36));
__m512 rcp37 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va37));
__m512 rcp38 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va38));
__m512 rcp39 = DenseNet121Rsqrt1(_mm512_add_ps(eps6, va39));
__m512 sc36 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 sc37 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 sc38 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*2);
__m512 sc39 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*3+(ptrdiff_t)80*2);
__m512 mul36 = _mm512_mul_ps(rcp36, sc36);
__m512 mul37 = _mm512_mul_ps(rcp37, sc37);
__m512 mul38 = _mm512_mul_ps(rcp38, sc38);
__m512 mul39 = _mm512_mul_ps(rcp39, sc39);
__m512 me36 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 me37 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 me38 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*2);
__m512 me39 = _mm512_loadu_ps(means6+(ptrdiff_t)16*3+(ptrdiff_t)80*2);
__m512 sh36 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*2);
__m512 sh37 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*2);
__m512 sh38 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*2);
__m512 sh39 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*3+(ptrdiff_t)80*2);
__m512 add36 = _mm512_fnmadd_ps(me36, mul36, sh36);
__m512 add37 = _mm512_fnmadd_ps(me37, mul37, sh37);
__m512 add38 = _mm512_fnmadd_ps(me38, mul38, sh38);
__m512 add39 = _mm512_fnmadd_ps(me39, mul39, sh39);
__m512 lo36 = _mm512_permutex2var_ps(mul36, xlo6, add36);
__m512 lo37 = _mm512_permutex2var_ps(mul37, xlo6, add37);
__m512 lo38 = _mm512_permutex2var_ps(mul38, xlo6, add38);
__m512 lo39 = _mm512_permutex2var_ps(mul39, xlo6, add39);
__m512 hi36 = _mm512_permutex2var_ps(mul36, xhi6, add36);
__m512 hi37 = _mm512_permutex2var_ps(mul37, xhi6, add37);
__m512 hi38 = _mm512_permutex2var_ps(mul38, xhi6, add38);
__m512 hi39 = _mm512_permutex2var_ps(mul39, xhi6, add39);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*0+(ptrdiff_t)640*2, lo36);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*1+(ptrdiff_t)640*2, hi36);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*2+(ptrdiff_t)640*2, lo37);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*3+(ptrdiff_t)640*2, hi37);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*4+(ptrdiff_t)640*2, lo38);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*5+(ptrdiff_t)640*2, hi38);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*6+(ptrdiff_t)640*2, lo39);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*7+(ptrdiff_t)640*2, hi39);
}

static void DenseNet121BnSimplify7(
float*restrict means7,
float*restrict variances7,
float*restrict scales7,
float*restrict shifts7,
char*restrict mas8
) {
	// Folds four parameter arrays into (multiplier, addend) float pairs:
	//   multiplier = scale * DenseNet121Rsqrt1(1e-05f + variance)
	//   addend     = shift - mean * multiplier
	// The pair for element k is written at byte offset 8*k of mas8,
	// multiplier first. 256 floats are read from each input array.
	// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file;
	// presumably an elementwise reciprocal square root -- confirm.
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// permutex2var indices 0..15 pick lanes of the first operand and
	// 16..31 pick lanes of the second, so these two tables interleave
	// lanes 0..7 (zipLo) and lanes 8..15 (zipHi) of two vectors.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 16 vectors of 16 floats; each vector yields 128 output bytes.
	for (ptrdiff_t vec = 0; vec < 16; ++vec) {
		const ptrdiff_t elem = (ptrdiff_t)16*vec;
		char* dst = mas8+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances7+elem);
		__m512 rsq = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales7+elem);
		__m512 multiplier = _mm512_mul_ps(rsq, scale);
		__m512 mean = _mm512_loadu_ps(means7+elem);
		__m512 shift = _mm512_loadu_ps(shifts7+elem);
		// fnmadd(a, b, c) computes c - a*b.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(multiplier, zipLo, addend));
		_mm512_storeu_ps(dst+64, _mm512_permutex2var_ps(multiplier, zipHi, addend));
	}
}

static void DenseNet121BnSimplify8(
float*restrict means8,
float*restrict variances8,
float*restrict scales8,
float*restrict shifts8,
char*restrict mas9
) {
	// Folds four parameter arrays into (multiplier, addend) float pairs:
	//   multiplier = scale * DenseNet121Rsqrt1(1e-05f + variance)
	//   addend     = shift - mean * multiplier
	// The pair for element k is written at byte offset 8*k of mas9,
	// multiplier first. 288 floats are read from each input array.
	// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file;
	// presumably an elementwise reciprocal square root -- confirm.
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// permutex2var indices 0..15 pick lanes of the first operand and
	// 16..31 pick lanes of the second, so these two tables interleave
	// lanes 0..7 (zipLo) and lanes 8..15 (zipHi) of two vectors.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 18 vectors of 16 floats; each vector yields 128 output bytes.
	for (ptrdiff_t vec = 0; vec < 18; ++vec) {
		const ptrdiff_t elem = (ptrdiff_t)16*vec;
		char* dst = mas9+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances8+elem);
		__m512 rsq = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales8+elem);
		__m512 multiplier = _mm512_mul_ps(rsq, scale);
		__m512 mean = _mm512_loadu_ps(means8+elem);
		__m512 shift = _mm512_loadu_ps(shifts8+elem);
		// fnmadd(a, b, c) computes c - a*b.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(multiplier, zipLo, addend));
		_mm512_storeu_ps(dst+64, _mm512_permutex2var_ps(multiplier, zipHi, addend));
	}
}

static void DenseNet121BnSimplify9(
float*restrict means9,
float*restrict variances9,
float*restrict scales9,
float*restrict shifts9,
char*restrict mas10
) {
	// Folds four parameter arrays into (multiplier, addend) float pairs:
	//   multiplier = scale * DenseNet121Rsqrt1(1e-05f + variance)
	//   addend     = shift - mean * multiplier
	// The pair for element k is written at byte offset 8*k of mas10,
	// multiplier first. 320 floats are read from each input array.
	// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file;
	// presumably an elementwise reciprocal square root -- confirm.
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// permutex2var indices 0..15 pick lanes of the first operand and
	// 16..31 pick lanes of the second, so these two tables interleave
	// lanes 0..7 (zipLo) and lanes 8..15 (zipHi) of two vectors.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 20 vectors of 16 floats; each vector yields 128 output bytes.
	for (ptrdiff_t vec = 0; vec < 20; ++vec) {
		const ptrdiff_t elem = (ptrdiff_t)16*vec;
		char* dst = mas10+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances9+elem);
		__m512 rsq = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales9+elem);
		__m512 multiplier = _mm512_mul_ps(rsq, scale);
		__m512 mean = _mm512_loadu_ps(means9+elem);
		__m512 shift = _mm512_loadu_ps(shifts9+elem);
		// fnmadd(a, b, c) computes c - a*b.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(multiplier, zipLo, addend));
		_mm512_storeu_ps(dst+64, _mm512_permutex2var_ps(multiplier, zipHi, addend));
	}
}

static void DenseNet121BnSimplify10(
float*restrict means10,
float*restrict variances10,
float*restrict scales10,
float*restrict shifts10,
char*restrict mas11
) {
	// Folds four parameter arrays into (multiplier, addend) float pairs:
	//   multiplier = scale * DenseNet121Rsqrt1(1e-05f + variance)
	//   addend     = shift - mean * multiplier
	// The pair for element k is written at byte offset 8*k of mas11,
	// multiplier first. 352 floats are read from each input array.
	// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file;
	// presumably an elementwise reciprocal square root -- confirm.
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// permutex2var indices 0..15 pick lanes of the first operand and
	// 16..31 pick lanes of the second, so these two tables interleave
	// lanes 0..7 (zipLo) and lanes 8..15 (zipHi) of two vectors.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 22 vectors of 16 floats; each vector yields 128 output bytes.
	for (ptrdiff_t vec = 0; vec < 22; ++vec) {
		const ptrdiff_t elem = (ptrdiff_t)16*vec;
		char* dst = mas11+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances10+elem);
		__m512 rsq = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales10+elem);
		__m512 multiplier = _mm512_mul_ps(rsq, scale);
		__m512 mean = _mm512_loadu_ps(means10+elem);
		__m512 shift = _mm512_loadu_ps(shifts10+elem);
		// fnmadd(a, b, c) computes c - a*b.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(multiplier, zipLo, addend));
		_mm512_storeu_ps(dst+64, _mm512_permutex2var_ps(multiplier, zipHi, addend));
	}
}

static void DenseNet121BnSimplify11(
float*restrict means11,
float*restrict variances11,
float*restrict scales11,
float*restrict shifts11,
char*restrict mas12
) {
	// Folds four parameter arrays into (multiplier, addend) float pairs:
	//   multiplier = scale * DenseNet121Rsqrt1(1e-05f + variance)
	//   addend     = shift - mean * multiplier
	// The pair for element k is written at byte offset 8*k of mas12,
	// multiplier first. 384 floats are read from each input array.
	// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file;
	// presumably an elementwise reciprocal square root -- confirm.
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// permutex2var indices 0..15 pick lanes of the first operand and
	// 16..31 pick lanes of the second, so these two tables interleave
	// lanes 0..7 (zipLo) and lanes 8..15 (zipHi) of two vectors.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 24 vectors of 16 floats; each vector yields 128 output bytes.
	for (ptrdiff_t vec = 0; vec < 24; ++vec) {
		const ptrdiff_t elem = (ptrdiff_t)16*vec;
		char* dst = mas12+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances11+elem);
		__m512 rsq = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales11+elem);
		__m512 multiplier = _mm512_mul_ps(rsq, scale);
		__m512 mean = _mm512_loadu_ps(means11+elem);
		__m512 shift = _mm512_loadu_ps(shifts11+elem);
		// fnmadd(a, b, c) computes c - a*b.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(multiplier, zipLo, addend));
		_mm512_storeu_ps(dst+64, _mm512_permutex2var_ps(multiplier, zipHi, addend));
	}
}

static void DenseNet121BnSimplify12(
float*restrict means12,
float*restrict variances12,
float*restrict scales12,
float*restrict shifts12,
char*restrict mas13
) {
	// Folds four parameter arrays into (multiplier, addend) float pairs:
	//   multiplier = scale * DenseNet121Rsqrt1(1e-05f + variance)
	//   addend     = shift - mean * multiplier
	// The pair for element k is written at byte offset 8*k of mas13,
	// multiplier first. 416 floats are read from each input array.
	// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file;
	// presumably an elementwise reciprocal square root -- confirm.
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// permutex2var indices 0..15 pick lanes of the first operand and
	// 16..31 pick lanes of the second, so these two tables interleave
	// lanes 0..7 (zipLo) and lanes 8..15 (zipHi) of two vectors.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 26 vectors of 16 floats; each vector yields 128 output bytes.
	for (ptrdiff_t vec = 0; vec < 26; ++vec) {
		const ptrdiff_t elem = (ptrdiff_t)16*vec;
		char* dst = mas13+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances12+elem);
		__m512 rsq = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales12+elem);
		__m512 multiplier = _mm512_mul_ps(rsq, scale);
		__m512 mean = _mm512_loadu_ps(means12+elem);
		__m512 shift = _mm512_loadu_ps(shifts12+elem);
		// fnmadd(a, b, c) computes c - a*b.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(multiplier, zipLo, addend));
		_mm512_storeu_ps(dst+64, _mm512_permutex2var_ps(multiplier, zipHi, addend));
	}
}

// Folds the four batch-norm parameter arrays into one (multiplier, addend)
// pair per value and writes the pairs interleaved into mas14.
//
// For each element k of the input arrays:
//   mul[k] = DenseNet121Rsqrt1(1e-05f + variances13[k]) * scales13[k]
//   add[k] = shifts13[k] - means13[k] * mul[k]   (single fused operation)
// and the output receives mul[k], add[k] as adjacent floats.
//
// Reads 28 vectors of 16 floats (448 values) from each input array and
// writes 28 * 128 bytes to mas14.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere; presumably it is a
// reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify13(
float*restrict means13,
float*restrict variances13,
float*restrict scales13,
float*restrict shifts13,
char*restrict mas14
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand (the addend); the low
	// four bits select the lane. pickLo interleaves lanes 0..7 of both
	// operands, pickHi interleaves lanes 8..15.
	const __m512i pickLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i pickHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	const ptrdiff_t groups = 28;
	for (ptrdiff_t grp = 0; grp < groups; ++grp) {
		// Each group consumes 16 floats per input and produces 128 bytes.
		const ptrdiff_t src = (ptrdiff_t)16*grp;
		char* dst = mas14+(ptrdiff_t)128*grp;
		__m512 variance = _mm512_loadu_ps(variances13+src);
		__m512 invStd = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales13+src);
		__m512 mul = _mm512_mul_ps(invStd, scale);
		__m512 mean = _mm512_loadu_ps(means13+src);
		__m512 shift = _mm512_loadu_ps(shifts13+src);
		// fnmadd computes -(mean*mul)+shift.
		__m512 add = _mm512_fnmadd_ps(mean, mul, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(mul, pickLo, add));
		_mm512_storeu_ps(dst+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, pickHi, add));
	}
}

// Folds the four batch-norm parameter arrays into one (multiplier, addend)
// pair per value and writes the pairs interleaved into mas15.
//
// For each element k of the input arrays:
//   mul[k] = DenseNet121Rsqrt1(1e-05f + variances14[k]) * scales14[k]
//   add[k] = shifts14[k] - means14[k] * mul[k]   (single fused operation)
// and the output receives mul[k], add[k] as adjacent floats.
//
// Reads 30 vectors of 16 floats (480 values) from each input array and
// writes 30 * 128 bytes to mas15.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere; presumably it is a
// reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify14(
float*restrict means14,
float*restrict variances14,
float*restrict scales14,
float*restrict shifts14,
char*restrict mas15
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand (the addend); the low
	// four bits select the lane. pickLo interleaves lanes 0..7 of both
	// operands, pickHi interleaves lanes 8..15.
	const __m512i pickLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i pickHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	const ptrdiff_t groups = 30;
	for (ptrdiff_t grp = 0; grp < groups; ++grp) {
		// Each group consumes 16 floats per input and produces 128 bytes.
		const ptrdiff_t src = (ptrdiff_t)16*grp;
		char* dst = mas15+(ptrdiff_t)128*grp;
		__m512 variance = _mm512_loadu_ps(variances14+src);
		__m512 invStd = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales14+src);
		__m512 mul = _mm512_mul_ps(invStd, scale);
		__m512 mean = _mm512_loadu_ps(means14+src);
		__m512 shift = _mm512_loadu_ps(shifts14+src);
		// fnmadd computes -(mean*mul)+shift.
		__m512 add = _mm512_fnmadd_ps(mean, mul, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(mul, pickLo, add));
		_mm512_storeu_ps(dst+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, pickHi, add));
	}
}

// Folds the four batch-norm parameter arrays into one (multiplier, addend)
// pair per value and writes the pairs interleaved into mas16.
//
// For each element k of the input arrays:
//   mul[k] = DenseNet121Rsqrt1(1e-05f + variances15[k]) * scales15[k]
//   add[k] = shifts15[k] - means15[k] * mul[k]   (single fused operation)
// and the output receives mul[k], add[k] as adjacent floats.
//
// Reads 32 vectors of 16 floats (512 values) from each input array and
// writes 32 * 128 bytes to mas16.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere; presumably it is a
// reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify15(
float*restrict means15,
float*restrict variances15,
float*restrict scales15,
float*restrict shifts15,
char*restrict mas16
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand (the addend); the low
	// four bits select the lane. pickLo interleaves lanes 0..7 of both
	// operands, pickHi interleaves lanes 8..15.
	const __m512i pickLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i pickHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	const ptrdiff_t groups = 32;
	for (ptrdiff_t grp = 0; grp < groups; ++grp) {
		// Each group consumes 16 floats per input and produces 128 bytes.
		const ptrdiff_t src = (ptrdiff_t)16*grp;
		char* dst = mas16+(ptrdiff_t)128*grp;
		__m512 variance = _mm512_loadu_ps(variances15+src);
		__m512 invStd = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales15+src);
		__m512 mul = _mm512_mul_ps(invStd, scale);
		__m512 mean = _mm512_loadu_ps(means15+src);
		__m512 shift = _mm512_loadu_ps(shifts15+src);
		// fnmadd computes -(mean*mul)+shift.
		__m512 add = _mm512_fnmadd_ps(mean, mul, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(mul, pickLo, add));
		_mm512_storeu_ps(dst+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, pickHi, add));
	}
}

// Folds the four batch-norm parameter arrays into one (multiplier, addend)
// pair per value and writes the pairs interleaved into mas17.
//
// For each element k of the input arrays:
//   mul[k] = DenseNet121Rsqrt1(1e-05f + variances16[k]) * scales16[k]
//   add[k] = shifts16[k] - means16[k] * mul[k]   (single fused operation)
// and the output receives mul[k], add[k] as adjacent floats.
//
// Reads 34 vectors of 16 floats (544 values) from each input array and
// writes 34 * 128 bytes to mas17.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere; presumably it is a
// reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify16(
float*restrict means16,
float*restrict variances16,
float*restrict scales16,
float*restrict shifts16,
char*restrict mas17
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand (the addend); the low
	// four bits select the lane. pickLo interleaves lanes 0..7 of both
	// operands, pickHi interleaves lanes 8..15.
	const __m512i pickLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i pickHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	const ptrdiff_t groups = 34;
	for (ptrdiff_t grp = 0; grp < groups; ++grp) {
		// Each group consumes 16 floats per input and produces 128 bytes.
		const ptrdiff_t src = (ptrdiff_t)16*grp;
		char* dst = mas17+(ptrdiff_t)128*grp;
		__m512 variance = _mm512_loadu_ps(variances16+src);
		__m512 invStd = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales16+src);
		__m512 mul = _mm512_mul_ps(invStd, scale);
		__m512 mean = _mm512_loadu_ps(means16+src);
		__m512 shift = _mm512_loadu_ps(shifts16+src);
		// fnmadd computes -(mean*mul)+shift.
		__m512 add = _mm512_fnmadd_ps(mean, mul, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(mul, pickLo, add));
		_mm512_storeu_ps(dst+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, pickHi, add));
	}
}

// Folds the four batch-norm parameter arrays into one (multiplier, addend)
// pair per value and writes the pairs interleaved into mas18.
//
// For each element k of the input arrays:
//   mul[k] = DenseNet121Rsqrt1(1e-05f + variances17[k]) * scales17[k]
//   add[k] = shifts17[k] - means17[k] * mul[k]   (single fused operation)
// and the output receives mul[k], add[k] as adjacent floats.
//
// Reads 36 vectors of 16 floats (576 values) from each input array and
// writes 36 * 128 bytes to mas18.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere; presumably it is a
// reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify17(
float*restrict means17,
float*restrict variances17,
float*restrict scales17,
float*restrict shifts17,
char*restrict mas18
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand (the addend); the low
	// four bits select the lane. pickLo interleaves lanes 0..7 of both
	// operands, pickHi interleaves lanes 8..15.
	const __m512i pickLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i pickHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	const ptrdiff_t groups = 36;
	for (ptrdiff_t grp = 0; grp < groups; ++grp) {
		// Each group consumes 16 floats per input and produces 128 bytes.
		const ptrdiff_t src = (ptrdiff_t)16*grp;
		char* dst = mas18+(ptrdiff_t)128*grp;
		__m512 variance = _mm512_loadu_ps(variances17+src);
		__m512 invStd = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales17+src);
		__m512 mul = _mm512_mul_ps(invStd, scale);
		__m512 mean = _mm512_loadu_ps(means17+src);
		__m512 shift = _mm512_loadu_ps(shifts17+src);
		// fnmadd computes -(mean*mul)+shift.
		__m512 add = _mm512_fnmadd_ps(mean, mul, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(mul, pickLo, add));
		_mm512_storeu_ps(dst+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, pickHi, add));
	}
}

// Folds the four batch-norm parameter arrays into one (multiplier, addend)
// pair per value and writes the pairs interleaved into mas19.
//
// For each element k of the input arrays:
//   mul[k] = DenseNet121Rsqrt1(1e-05f + variances18[k]) * scales18[k]
//   add[k] = shifts18[k] - means18[k] * mul[k]   (single fused operation)
// and the output receives mul[k], add[k] as adjacent floats.
//
// Reads 38 vectors of 16 floats (608 values) from each input array and
// writes 38 * 128 bytes to mas19.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere; presumably it is a
// reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify18(
float*restrict means18,
float*restrict variances18,
float*restrict scales18,
float*restrict shifts18,
char*restrict mas19
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand (the addend); the low
	// four bits select the lane. pickLo interleaves lanes 0..7 of both
	// operands, pickHi interleaves lanes 8..15.
	const __m512i pickLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i pickHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	const ptrdiff_t groups = 38;
	for (ptrdiff_t grp = 0; grp < groups; ++grp) {
		// Each group consumes 16 floats per input and produces 128 bytes.
		const ptrdiff_t src = (ptrdiff_t)16*grp;
		char* dst = mas19+(ptrdiff_t)128*grp;
		__m512 variance = _mm512_loadu_ps(variances18+src);
		__m512 invStd = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales18+src);
		__m512 mul = _mm512_mul_ps(invStd, scale);
		__m512 mean = _mm512_loadu_ps(means18+src);
		__m512 shift = _mm512_loadu_ps(shifts18+src);
		// fnmadd computes -(mean*mul)+shift.
		__m512 add = _mm512_fnmadd_ps(mean, mul, shift);
		_mm512_storeu_ps(dst, _mm512_permutex2var_ps(mul, pickLo, add));
		_mm512_storeu_ps(dst+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, pickHi, add));
	}
}

// Folds four per-channel float arrays (means, variances, scales, shifts)
// into one interleaved table of (multiplier, addend) float pairs.
//
// For every 16-float vector v of the inputs:
//   multiplier = DenseNet121Rsqrt1(1e-05 + variance) * scale
//   addend     = shift - mean * multiplier
// DenseNet121Rsqrt1 is defined elsewhere in this file (presumably a
// reciprocal square root -- confirm against its definition).
//
// This variant reads 40 vectors (640 floats) from each input array and
// writes 40 * 128 = 5120 bytes to mas20. Element c of the inputs ends up
// at byte offset 8*c of mas20 as the multiplier followed by the addend.
static void DenseNet121BnSimplify19(
float*restrict means19,
float*restrict variances19,
float*restrict scales19,
float*restrict shifts19,
char*restrict mas20
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand, so these patterns
	// zip lanes 0..7 (low) and lanes 8..15 (high) of the two operands.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	for (ptrdiff_t vec = 0; vec < 40; ++vec) {
		const ptrdiff_t src = (ptrdiff_t)16*vec;
		char* dst = mas20+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances19+src);
		__m512 invRoot = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales19+src);
		__m512 multiplier = _mm512_mul_ps(invRoot, scale);
		__m512 mean = _mm512_loadu_ps(means19+src);
		__m512 shift = _mm512_loadu_ps(shifts19+src);
		// fnmadd computes -(mean*multiplier) + shift.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		__m512 pairsLo = _mm512_permutex2var_ps(multiplier, zipLo, addend);
		__m512 pairsHi = _mm512_permutex2var_ps(multiplier, zipHi, addend);
		_mm512_storeu_ps(dst, pairsLo);
		_mm512_storeu_ps(dst+64, pairsHi);
	}
}

// Folds four per-channel float arrays (means, variances, scales, shifts)
// into one interleaved table of (multiplier, addend) float pairs.
//
// For every 16-float vector v of the inputs:
//   multiplier = DenseNet121Rsqrt1(1e-05 + variance) * scale
//   addend     = shift - mean * multiplier
// DenseNet121Rsqrt1 is defined elsewhere in this file (presumably a
// reciprocal square root -- confirm against its definition).
//
// This variant reads 42 vectors (672 floats) from each input array and
// writes 42 * 128 = 5376 bytes to mas21. Element c of the inputs ends up
// at byte offset 8*c of mas21 as the multiplier followed by the addend.
static void DenseNet121BnSimplify20(
float*restrict means20,
float*restrict variances20,
float*restrict scales20,
float*restrict shifts20,
char*restrict mas21
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand, so these patterns
	// zip lanes 0..7 (low) and lanes 8..15 (high) of the two operands.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 8 full groups of 5 vectors plus a 2-vector remainder, walked as one run.
	for (ptrdiff_t vec = 0; vec < 42; ++vec) {
		const ptrdiff_t src = (ptrdiff_t)16*vec;
		char* dst = mas21+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances20+src);
		__m512 invRoot = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales20+src);
		__m512 multiplier = _mm512_mul_ps(invRoot, scale);
		__m512 mean = _mm512_loadu_ps(means20+src);
		__m512 shift = _mm512_loadu_ps(shifts20+src);
		// fnmadd computes -(mean*multiplier) + shift.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		__m512 pairsLo = _mm512_permutex2var_ps(multiplier, zipLo, addend);
		__m512 pairsHi = _mm512_permutex2var_ps(multiplier, zipHi, addend);
		_mm512_storeu_ps(dst, pairsLo);
		_mm512_storeu_ps(dst+64, pairsHi);
	}
}

// Folds four per-channel float arrays (means, variances, scales, shifts)
// into one interleaved table of (multiplier, addend) float pairs.
//
// For every 16-float vector v of the inputs:
//   multiplier = DenseNet121Rsqrt1(1e-05 + variance) * scale
//   addend     = shift - mean * multiplier
// DenseNet121Rsqrt1 is defined elsewhere in this file (presumably a
// reciprocal square root -- confirm against its definition).
//
// This variant reads 44 vectors (704 floats) from each input array and
// writes 44 * 128 = 5632 bytes to mas22. Element c of the inputs ends up
// at byte offset 8*c of mas22 as the multiplier followed by the addend.
static void DenseNet121BnSimplify21(
float*restrict means21,
float*restrict variances21,
float*restrict scales21,
float*restrict shifts21,
char*restrict mas22
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand, so these patterns
	// zip lanes 0..7 (low) and lanes 8..15 (high) of the two operands.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 8 full groups of 5 vectors plus a 4-vector remainder, walked as one run.
	for (ptrdiff_t vec = 0; vec < 44; ++vec) {
		const ptrdiff_t src = (ptrdiff_t)16*vec;
		char* dst = mas22+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances21+src);
		__m512 invRoot = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales21+src);
		__m512 multiplier = _mm512_mul_ps(invRoot, scale);
		__m512 mean = _mm512_loadu_ps(means21+src);
		__m512 shift = _mm512_loadu_ps(shifts21+src);
		// fnmadd computes -(mean*multiplier) + shift.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		__m512 pairsLo = _mm512_permutex2var_ps(multiplier, zipLo, addend);
		__m512 pairsHi = _mm512_permutex2var_ps(multiplier, zipHi, addend);
		_mm512_storeu_ps(dst, pairsLo);
		_mm512_storeu_ps(dst+64, pairsHi);
	}
}

// Folds four per-channel float arrays (means, variances, scales, shifts)
// into one interleaved table of (multiplier, addend) float pairs.
//
// For every 16-float vector v of the inputs:
//   multiplier = DenseNet121Rsqrt1(1e-05 + variance) * scale
//   addend     = shift - mean * multiplier
// DenseNet121Rsqrt1 is defined elsewhere in this file (presumably a
// reciprocal square root -- confirm against its definition).
//
// This variant reads 46 vectors (736 floats) from each input array and
// writes 46 * 128 = 5888 bytes to mas23. Element c of the inputs ends up
// at byte offset 8*c of mas23 as the multiplier followed by the addend.
static void DenseNet121BnSimplify22(
float*restrict means22,
float*restrict variances22,
float*restrict scales22,
float*restrict shifts22,
char*restrict mas23
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand, so these patterns
	// zip lanes 0..7 (low) and lanes 8..15 (high) of the two operands.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 9 full groups of 5 vectors plus a 1-vector remainder, walked as one run.
	for (ptrdiff_t vec = 0; vec < 46; ++vec) {
		const ptrdiff_t src = (ptrdiff_t)16*vec;
		char* dst = mas23+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances22+src);
		__m512 invRoot = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales22+src);
		__m512 multiplier = _mm512_mul_ps(invRoot, scale);
		__m512 mean = _mm512_loadu_ps(means22+src);
		__m512 shift = _mm512_loadu_ps(shifts22+src);
		// fnmadd computes -(mean*multiplier) + shift.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		__m512 pairsLo = _mm512_permutex2var_ps(multiplier, zipLo, addend);
		__m512 pairsHi = _mm512_permutex2var_ps(multiplier, zipHi, addend);
		_mm512_storeu_ps(dst, pairsLo);
		_mm512_storeu_ps(dst+64, pairsHi);
	}
}

// Folds four per-channel float arrays (means, variances, scales, shifts)
// into one interleaved table of (multiplier, addend) float pairs.
//
// For every 16-float vector v of the inputs:
//   multiplier = DenseNet121Rsqrt1(1e-05 + variance) * scale
//   addend     = shift - mean * multiplier
// DenseNet121Rsqrt1 is defined elsewhere in this file (presumably a
// reciprocal square root -- confirm against its definition).
//
// This variant reads 48 vectors (768 floats) from each input array and
// writes 48 * 128 = 6144 bytes to mas24. Element c of the inputs ends up
// at byte offset 8*c of mas24 as the multiplier followed by the addend.
static void DenseNet121BnSimplify23(
float*restrict means23,
float*restrict variances23,
float*restrict scales23,
float*restrict shifts23,
char*restrict mas24
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand, so these patterns
	// zip lanes 0..7 (low) and lanes 8..15 (high) of the two operands.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	// 9 full groups of 5 vectors plus a 3-vector remainder, walked as one run.
	for (ptrdiff_t vec = 0; vec < 48; ++vec) {
		const ptrdiff_t src = (ptrdiff_t)16*vec;
		char* dst = mas24+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances23+src);
		__m512 invRoot = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales23+src);
		__m512 multiplier = _mm512_mul_ps(invRoot, scale);
		__m512 mean = _mm512_loadu_ps(means23+src);
		__m512 shift = _mm512_loadu_ps(shifts23+src);
		// fnmadd computes -(mean*multiplier) + shift.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		__m512 pairsLo = _mm512_permutex2var_ps(multiplier, zipLo, addend);
		__m512 pairsHi = _mm512_permutex2var_ps(multiplier, zipHi, addend);
		_mm512_storeu_ps(dst, pairsLo);
		_mm512_storeu_ps(dst+64, pairsHi);
	}
}

// Folds four per-channel float arrays (means, variances, scales, shifts)
// into one interleaved table of (multiplier, addend) float pairs.
//
// For every 16-float vector v of the inputs:
//   multiplier = DenseNet121Rsqrt1(1e-05 + variance) * scale
//   addend     = shift - mean * multiplier
// DenseNet121Rsqrt1 is defined elsewhere in this file (presumably a
// reciprocal square root -- confirm against its definition).
//
// This variant reads 50 vectors (800 floats) from each input array and
// writes 50 * 128 = 6400 bytes to mas25. Element c of the inputs ends up
// at byte offset 8*c of mas25 as the multiplier followed by the addend.
static void DenseNet121BnSimplify24(
float*restrict means24,
float*restrict variances24,
float*restrict scales24,
float*restrict shifts24,
char*restrict mas25
) {
	const __m512 epsilon = _mm512_set1_ps(1e-05f);
	// Index bit 4 selects the second permute operand, so these patterns
	// zip lanes 0..7 (low) and lanes 8..15 (high) of the two operands.
	const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
	const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
	for (ptrdiff_t vec = 0; vec < 50; ++vec) {
		const ptrdiff_t src = (ptrdiff_t)16*vec;
		char* dst = mas25+(ptrdiff_t)128*vec;
		__m512 variance = _mm512_loadu_ps(variances24+src);
		__m512 invRoot = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
		__m512 scale = _mm512_loadu_ps(scales24+src);
		__m512 multiplier = _mm512_mul_ps(invRoot, scale);
		__m512 mean = _mm512_loadu_ps(means24+src);
		__m512 shift = _mm512_loadu_ps(shifts24+src);
		// fnmadd computes -(mean*multiplier) + shift.
		__m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
		__m512 pairsLo = _mm512_permutex2var_ps(multiplier, zipLo, addend);
		__m512 pairsHi = _mm512_permutex2var_ps(multiplier, zipHi, addend);
		_mm512_storeu_ps(dst, pairsLo);
		_mm512_storeu_ps(dst+64, pairsHi);
	}
}

// Precomputes a (multiplier, addend) pair for each of 832 elements:
//   multiplier = scales25 * DenseNet121Rsqrt1(1e-05f + variances25)
//   addend     = shifts25 - means25 * multiplier
// The pairs are written interleaved (multiplier first, then addend) to
// mas26, 8 bytes per element, 6656 bytes in total.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file and is
// presumably a reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify25(
float*restrict means25,
float*restrict variances25,
float*restrict scales25,
float*restrict shifts25,
char*restrict mas26
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Index bit 4 selects the second permute operand (the addend), so these
    // patterns zip lanes 0..7 (pairLo) and lanes 8..15 (pairHi) of the
    // multiplier with the matching addend lanes.
    const __m512i pairLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 52 groups of 16 floats; each group yields two 64-byte output vectors.
    for (ptrdiff_t group = 0; group < 52; ++group) {
        const ptrdiff_t at = (ptrdiff_t)16*group;
        char* out = mas26+(ptrdiff_t)128*group;
        __m512 variance = _mm512_loadu_ps(variances25+at);
        __m512 rs = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 multiplier = _mm512_mul_ps(rs, _mm512_loadu_ps(scales25+at));
        __m512 mean = _mm512_loadu_ps(means25+at);
        __m512 shift = _mm512_loadu_ps(shifts25+at);
        // fnmadd computes -(mean*multiplier)+shift.
        __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(multiplier, pairLo, addend));
        _mm512_storeu_ps(out+64, _mm512_permutex2var_ps(multiplier, pairHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 864 elements:
//   multiplier = scales26 * DenseNet121Rsqrt1(1e-05f + variances26)
//   addend     = shifts26 - means26 * multiplier
// The pairs are written interleaved (multiplier first, then addend) to
// mas27, 8 bytes per element, 6912 bytes in total.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file and is
// presumably a reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify26(
float*restrict means26,
float*restrict variances26,
float*restrict scales26,
float*restrict shifts26,
char*restrict mas27
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Index bit 4 selects the second permute operand (the addend), so these
    // patterns zip lanes 0..7 (pairLo) and lanes 8..15 (pairHi) of the
    // multiplier with the matching addend lanes.
    const __m512i pairLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 54 groups of 16 floats; each group yields two 64-byte output vectors.
    for (ptrdiff_t group = 0; group < 54; ++group) {
        const ptrdiff_t at = (ptrdiff_t)16*group;
        char* out = mas27+(ptrdiff_t)128*group;
        __m512 variance = _mm512_loadu_ps(variances26+at);
        __m512 rs = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 multiplier = _mm512_mul_ps(rs, _mm512_loadu_ps(scales26+at));
        __m512 mean = _mm512_loadu_ps(means26+at);
        __m512 shift = _mm512_loadu_ps(shifts26+at);
        // fnmadd computes -(mean*multiplier)+shift.
        __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(multiplier, pairLo, addend));
        _mm512_storeu_ps(out+64, _mm512_permutex2var_ps(multiplier, pairHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 896 elements:
//   multiplier = scales27 * DenseNet121Rsqrt1(1e-05f + variances27)
//   addend     = shifts27 - means27 * multiplier
// The pairs are written interleaved (multiplier first, then addend) to
// mas28, 8 bytes per element, 7168 bytes in total.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file and is
// presumably a reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify27(
float*restrict means27,
float*restrict variances27,
float*restrict scales27,
float*restrict shifts27,
char*restrict mas28
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Index bit 4 selects the second permute operand (the addend), so these
    // patterns zip lanes 0..7 (pairLo) and lanes 8..15 (pairHi) of the
    // multiplier with the matching addend lanes.
    const __m512i pairLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 56 groups of 16 floats; each group yields two 64-byte output vectors.
    for (ptrdiff_t group = 0; group < 56; ++group) {
        const ptrdiff_t at = (ptrdiff_t)16*group;
        char* out = mas28+(ptrdiff_t)128*group;
        __m512 variance = _mm512_loadu_ps(variances27+at);
        __m512 rs = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 multiplier = _mm512_mul_ps(rs, _mm512_loadu_ps(scales27+at));
        __m512 mean = _mm512_loadu_ps(means27+at);
        __m512 shift = _mm512_loadu_ps(shifts27+at);
        // fnmadd computes -(mean*multiplier)+shift.
        __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(multiplier, pairLo, addend));
        _mm512_storeu_ps(out+64, _mm512_permutex2var_ps(multiplier, pairHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 928 elements:
//   multiplier = scales28 * DenseNet121Rsqrt1(1e-05f + variances28)
//   addend     = shifts28 - means28 * multiplier
// The pairs are written interleaved (multiplier first, then addend) to
// mas29, 8 bytes per element, 7424 bytes in total.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file and is
// presumably a reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify28(
float*restrict means28,
float*restrict variances28,
float*restrict scales28,
float*restrict shifts28,
char*restrict mas29
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Index bit 4 selects the second permute operand (the addend), so these
    // patterns zip lanes 0..7 (pairLo) and lanes 8..15 (pairHi) of the
    // multiplier with the matching addend lanes.
    const __m512i pairLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 58 groups of 16 floats; each group yields two 64-byte output vectors.
    for (ptrdiff_t group = 0; group < 58; ++group) {
        const ptrdiff_t at = (ptrdiff_t)16*group;
        char* out = mas29+(ptrdiff_t)128*group;
        __m512 variance = _mm512_loadu_ps(variances28+at);
        __m512 rs = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 multiplier = _mm512_mul_ps(rs, _mm512_loadu_ps(scales28+at));
        __m512 mean = _mm512_loadu_ps(means28+at);
        __m512 shift = _mm512_loadu_ps(shifts28+at);
        // fnmadd computes -(mean*multiplier)+shift.
        __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(multiplier, pairLo, addend));
        _mm512_storeu_ps(out+64, _mm512_permutex2var_ps(multiplier, pairHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 960 elements:
//   multiplier = scales29 * DenseNet121Rsqrt1(1e-05f + variances29)
//   addend     = shifts29 - means29 * multiplier
// The pairs are written interleaved (multiplier first, then addend) to
// mas30, 8 bytes per element, 7680 bytes in total.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file and is
// presumably a reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify29(
float*restrict means29,
float*restrict variances29,
float*restrict scales29,
float*restrict shifts29,
char*restrict mas30
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Index bit 4 selects the second permute operand (the addend), so these
    // patterns zip lanes 0..7 (pairLo) and lanes 8..15 (pairHi) of the
    // multiplier with the matching addend lanes.
    const __m512i pairLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 60 groups of 16 floats; each group yields two 64-byte output vectors.
    for (ptrdiff_t group = 0; group < 60; ++group) {
        const ptrdiff_t at = (ptrdiff_t)16*group;
        char* out = mas30+(ptrdiff_t)128*group;
        __m512 variance = _mm512_loadu_ps(variances29+at);
        __m512 rs = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 multiplier = _mm512_mul_ps(rs, _mm512_loadu_ps(scales29+at));
        __m512 mean = _mm512_loadu_ps(means29+at);
        __m512 shift = _mm512_loadu_ps(shifts29+at);
        // fnmadd computes -(mean*multiplier)+shift.
        __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(multiplier, pairLo, addend));
        _mm512_storeu_ps(out+64, _mm512_permutex2var_ps(multiplier, pairHi, addend));
    }
}

// Precomputes a (multiplier, addend) pair for each of 992 elements:
//   multiplier = scales30 * DenseNet121Rsqrt1(1e-05f + variances30)
//   addend     = shifts30 - means30 * multiplier
// The pairs are written interleaved (multiplier first, then addend) to
// mas31, 8 bytes per element, 7936 bytes in total.
// NOTE(review): DenseNet121Rsqrt1 is defined elsewhere in this file and is
// presumably a reciprocal square root -- confirm against its definition.
static void DenseNet121BnSimplify30(
float*restrict means30,
float*restrict variances30,
float*restrict scales30,
float*restrict shifts30,
char*restrict mas31
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    // Index bit 4 selects the second permute operand (the addend), so these
    // patterns zip lanes 0..7 (pairLo) and lanes 8..15 (pairHi) of the
    // multiplier with the matching addend lanes.
    const __m512i pairLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i pairHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    // 62 groups of 16 floats; each group yields two 64-byte output vectors.
    for (ptrdiff_t group = 0; group < 62; ++group) {
        const ptrdiff_t at = (ptrdiff_t)16*group;
        char* out = mas31+(ptrdiff_t)128*group;
        __m512 variance = _mm512_loadu_ps(variances30+at);
        __m512 rs = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 multiplier = _mm512_mul_ps(rs, _mm512_loadu_ps(scales30+at));
        __m512 mean = _mm512_loadu_ps(means30+at);
        __m512 shift = _mm512_loadu_ps(shifts30+at);
        // fnmadd computes -(mean*multiplier)+shift.
        __m512 addend = _mm512_fnmadd_ps(mean, multiplier, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(multiplier, pairLo, addend));
        _mm512_storeu_ps(out+64, _mm512_permutex2var_ps(multiplier, pairHi, addend));
    }
}

/*
 * Folds four per-channel batch-norm arrays into one interleaved
 * (multiplier, addend) table.
 *
 * For every channel c in [0, 1024):
 *     mul[c] = DenseNet121Rsqrt1(1e-05 + variances31[c]) * scales31[c]
 *     add[c] = shifts31[c] - means31[c] * mul[c]
 * and the pair is written to mas32 as two consecutive floats
 * (mul[c] at byte 8*c, add[c] at byte 8*c+4).
 *
 * DenseNet121Rsqrt1 is defined elsewhere in this file; judging by its name it
 * is presumably a reciprocal square root -- confirm against its definition.
 *
 * The channels are handled 16 at a time (one 512-bit vector), 64 vectors in
 * total. Each vector of 16 channels yields 32 output floats (128 bytes).
 */
static void DenseNet121BnSimplify31(
float*restrict means31,
float*restrict variances31,
float*restrict scales31,
float*restrict shifts31,
char*restrict mas32
) {
    const __m512 epsilon = _mm512_set1_ps(1e-05f);
    /* Index bit 4 selects the second source, so these two tables zip
     * (mul, add) lane by lane: zipLo covers lanes 0..7, zipHi lanes 8..15. */
    const __m512i zipLo = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    const __m512i zipHi = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    for (ptrdiff_t vec = 0; vec < 64; ++vec) {
        const ptrdiff_t chan = (ptrdiff_t)16*vec;
        char* out = mas32+(ptrdiff_t)128*vec;
        __m512 variance = _mm512_loadu_ps(variances31+chan);
        __m512 inv = DenseNet121Rsqrt1(_mm512_add_ps(epsilon, variance));
        __m512 scale = _mm512_loadu_ps(scales31+chan);
        __m512 mul = _mm512_mul_ps(inv, scale);
        __m512 mean = _mm512_loadu_ps(means31+chan);
        __m512 shift = _mm512_loadu_ps(shifts31+chan);
        /* fnmadd: -(mean*mul) + shift */
        __m512 add = _mm512_fnmadd_ps(mean, mul, shift);
        _mm512_storeu_ps(out, _mm512_permutex2var_ps(mul, zipLo, add));
        _mm512_storeu_ps(out+(ptrdiff_t)64, _mm512_permutex2var_ps(mul, zipHi, add));
    }
}

// Worker for one task of the global-pooling step.
//
// pt214[0] (c240) selects a group of 128 channels. For each channel the code
// reads 49 floats from tensors416[0] (channel pitch 320 bytes), applies a
// per-channel multiply-add whose (mul, add) float pair comes from
// tensors416[1], clamps the result at zero, sums the 49 values, scales the
// sum by 2.0408163e-02 (approximately 1/49) and writes one float per channel
// to tensors416[2].
// NOTE(review): the 49 floats are presumably a 7x7 feature map and the
// (mul, add) pairs presumably a folded batch-norm -- confirm against the caller.
static void DenseNet121Glopl1Callee1(DenseNet121ThreaderTask1* task418, int64_t* pt214) {
char** tensors416 = task418->any1;
ptrdiff_t c240 = pt214[0];
// Per-task base pointers: 128 channels * 320 bytes of input, 128 channels * 8
// bytes of (mul, add) pairs, 128 channels * 4 bytes of output.
char*restrict ptr9 = tensors416[0]+(ptrdiff_t)40960*c240;
char*restrict ptr10 = tensors416[1]+(ptrdiff_t)8*128*c240;
char*restrict ptr11 = tensors416[2]+(ptrdiff_t)512*c240;
// buf1 collects 16 channel sums before they are stored; mask3 is the set of
// lanes the current iteration is still allowed to write.
__m512 buf1 = _mm512_setzero_ps();
__mmask16 mask3 = 65535;
// Each iteration handles two channels (2*i246 and 2*i246+1).
for (ptrdiff_t i246 = 0; i246 < 64; ++i246) {
// Broadcast the (mul, add) pair of each of the two channels.
__m512 bnMul100 = _mm512_set1_ps(((float*)ptr10+(ptrdiff_t)2*(0+(ptrdiff_t)2*i246))[0]);
__m512 bnAdd100 = _mm512_set1_ps(((float*)ptr10+(ptrdiff_t)2*(0+(ptrdiff_t)2*i246))[1]);
__m512 bnMul101 = _mm512_set1_ps(((float*)ptr10+(ptrdiff_t)2*(1+(ptrdiff_t)2*i246))[0]);
__m512 bnAdd101 = _mm512_set1_ps(((float*)ptr10+(ptrdiff_t)2*(1+(ptrdiff_t)2*i246))[1]);
// First channel: 16+16+16+1 = 49 floats (acc1..acc4). The mask of 1 on the
// last load keeps the read inside the 49th float.
__m512 acc1 = _mm512_maskz_loadu_ps(65535, ptr9+(ptrdiff_t)0+(ptrdiff_t)640*i246);
__m512 acc2 = _mm512_maskz_loadu_ps(65535, ptr9+(ptrdiff_t)64+(ptrdiff_t)640*i246);
__m512 acc3 = _mm512_maskz_loadu_ps(65535, ptr9+(ptrdiff_t)128+(ptrdiff_t)640*i246);
__m512 acc4 = _mm512_maskz_loadu_ps(1, ptr9+(ptrdiff_t)192+(ptrdiff_t)640*i246);
// Second channel, 320 bytes further on (acc5..acc8).
__m512 acc5 = _mm512_maskz_loadu_ps(65535, ptr9+(ptrdiff_t)320+(ptrdiff_t)640*i246);
__m512 acc6 = _mm512_maskz_loadu_ps(65535, ptr9+(ptrdiff_t)384+(ptrdiff_t)640*i246);
__m512 acc7 = _mm512_maskz_loadu_ps(65535, ptr9+(ptrdiff_t)448+(ptrdiff_t)640*i246);
__m512 acc8 = _mm512_maskz_loadu_ps(1, ptr9+(ptrdiff_t)512+(ptrdiff_t)640*i246);
// x*mul + add on every element.
acc1 = _mm512_fmadd_ps(acc1, bnMul100, bnAdd100);
acc2 = _mm512_fmadd_ps(acc2, bnMul100, bnAdd100);
acc3 = _mm512_fmadd_ps(acc3, bnMul100, bnAdd100);
acc4 = _mm512_fmadd_ps(acc4, bnMul100, bnAdd100);
acc5 = _mm512_fmadd_ps(acc5, bnMul101, bnAdd101);
acc6 = _mm512_fmadd_ps(acc6, bnMul101, bnAdd101);
acc7 = _mm512_fmadd_ps(acc7, bnMul101, bnAdd101);
acc8 = _mm512_fmadd_ps(acc8, bnMul101, bnAdd101);
// max(0, x).
acc1 = _mm512_max_ps(_mm512_setzero_ps(), acc1);
acc2 = _mm512_max_ps(_mm512_setzero_ps(), acc2);
acc3 = _mm512_max_ps(_mm512_setzero_ps(), acc3);
acc4 = _mm512_max_ps(_mm512_setzero_ps(), acc4);
acc5 = _mm512_max_ps(_mm512_setzero_ps(), acc5);
acc6 = _mm512_max_ps(_mm512_setzero_ps(), acc6);
acc7 = _mm512_max_ps(_mm512_setzero_ps(), acc7);
acc8 = _mm512_max_ps(_mm512_setzero_ps(), acc8);
// Combine the four vectors of each channel into one. The partial vector
// (acc4/acc8) is added with mask 1 so that only its single valid lane
// contributes; its other lanes hold max(0, add), not zero.
acc1 = _mm512_mask_add_ps(acc1, 65535, acc1, acc3);
acc5 = _mm512_mask_add_ps(acc5, 65535, acc5, acc7);
acc2 = _mm512_mask_add_ps(acc2, 1, acc2, acc4);
acc6 = _mm512_mask_add_ps(acc6, 1, acc6, acc8);
acc1 = _mm512_mask_add_ps(acc1, 65535, acc1, acc2);
acc5 = _mm512_mask_add_ps(acc5, 65535, acc5, acc6);
// Index tables for the final step: even lanes pick from acc1, odd lanes
// from acc5 (index bit 4 selects the second source).
__m512i pm1lo1 = _mm512_set_epi32(16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0);
__m512i pm1hi1 = _mm512_set_epi32(17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1);
// Horizontal reduction 16 -> 8 lanes: add the upper 256 bits onto the lower.
__m512 hi223 = _mm512_shuffle_f32x4(acc1, acc1, 238);
__m512 hi226 = _mm512_shuffle_f32x4(acc5, acc5, 238);
acc1 = _mm512_mask_add_ps(acc1, 255, acc1, hi223);
acc5 = _mm512_mask_add_ps(acc5, 255, acc5, hi226);
// 8 -> 4 lanes: add 128-bit lane 1 onto lane 0.
__m512 hi224 = _mm512_shuffle_f32x4(acc1, acc1, 1);
__m512 hi227 = _mm512_shuffle_f32x4(acc5, acc5, 1);
acc1 = _mm512_mask_add_ps(acc1, 15, acc1, hi224);
acc5 = _mm512_mask_add_ps(acc5, 15, acc5, hi227);
// 4 -> 2 lanes: add elements 2,3 onto elements 0,1.
__m512 hi225 = _mm512_shuffle_ps(acc1, acc1, 238);
__m512 hi228 = _mm512_shuffle_ps(acc5, acc5, 238);
acc1 = _mm512_mask_add_ps(acc1, 3, acc1, hi225);
acc5 = _mm512_mask_add_ps(acc5, 3, acc5, hi228);
// 2 -> 1: element0 + element1 of each channel. Afterwards every even lane
// holds the first channel's total and every odd lane the second channel's.
__m512 hi229 = _mm512_permutex2var_ps(acc1, pm1hi1, acc5);
acc1 = _mm512_permutex2var_ps(acc1, pm1lo1, acc5);
acc1 = _mm512_mask_add_ps(acc1, 65535, acc1, hi229);
// Deposit the pair into buf1. mask3 loses its two lowest set lanes on every
// iteration, so after 8 iterations lanes 2k and 2k+1 hold the pair produced
// by the k-th iteration of the group (later lanes are overwritten later).
buf1 = _mm512_mask_mov_ps(buf1, mask3, acc1);
mask3 &= mask3<<2;
// Every 8th iteration all 16 lanes are final: scale the sums and store them
// at the float index of the group's first channel (2*i246-14).
if (__builtin_expect(!mask3, 0)) {
mask3 = 65535;
buf1 = _mm512_mul_ps(buf1, _mm512_set1_ps(2.0408163e-02f));
_mm512_mask_storeu_ps(ptr11+(ptrdiff_t)4*((ptrdiff_t)2*i246-14), 65535, buf1);
}
}
}

/*
 * Runs DenseNet121Glopl1Callee1 on the given team through
 * DenseNet121ThreaderDo1, using a 1-dimensional task descriptor with
 * hull1[0] = 8. The callee reads its coordinate from pt[0] and the tensor
 * pointer array from any1.
 * NOTE(review): hull1 is presumably the extent of the task grid, i.e. the
 * callee runs for coordinates 0..7 -- confirm against DenseNet121ThreaderDo1.
 */
static void DenseNet121Glopl1(DenseNet121ThreaderTeam1* team221, char** tensors415) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 8;
    job.any1 = tensors415;
    job.callee1 = DenseNet121Glopl1Callee1;
    DenseNet121ThreaderDo1(team221, &job);
}

/*
 * Worker for one task of a 2x2, stride-2 average pooling step.
 *
 * Input rows are 56 floats (224 bytes) wide with a channel pitch of 12608
 * bytes; output rows are 28 floats (112 bytes) wide with a channel pitch of
 * 3136 bytes. Each output element is the sum of a 2x2 input window
 * multiplied by 0.25.
 *
 * pt37[0] (b52) and pt37[1] (c34) select the slice: tasks with c34 < 42
 * process 3 channels, the remaining task processes 2.
 */
static void DenseNet121Twopl1Callee1(DenseNet121ThreaderTask1* task64, int64_t* pt37) {
    char** tensors62 = task64->any1;
    const ptrdiff_t b52 = pt37[0];
    const ptrdiff_t c34 = pt37[1];
    char*restrict src = tensors62[0]+(ptrdiff_t)12544*b52+(ptrdiff_t)37824*c34;
    char*restrict dst = tensors62[1]+(ptrdiff_t)3136*b52+(ptrdiff_t)9408*c34;
    const ptrdiff_t channels = (c34 < 42) ? 3 : 2;
    /* Gather the even / odd columns of a 32-float pair of vectors. */
    const __m512i evenCols = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const __m512i oddCols = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    const __m512 quarter = _mm512_set1_ps(2.5e-01f);
    for (ptrdiff_t ch = 0; ch < channels; ++ch) {
        for (ptrdiff_t row = 0; row < 28; ++row) {
            char* rowIn = src+(ptrdiff_t)12608*ch+(ptrdiff_t)448*row;
            char* rowOut = dst+(ptrdiff_t)3136*ch+(ptrdiff_t)112*row;
            /* Segment 0: input columns 0..31 -> 16 outputs.
             * Segment 1: input columns 32..55 -> 12 outputs. */
            for (ptrdiff_t seg = 0; seg < 2; ++seg) {
                const __mmask16 upperMask = seg ? 255 : 65535;
                const __mmask16 storeMask = seg ? 4095 : 65535;
                char* in = rowIn+(ptrdiff_t)128*seg;
                __m512 top0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
                __m512 top1 = _mm512_maskz_loadu_ps(upperMask, in+(ptrdiff_t)64);
                __m512 bot0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)224);
                __m512 bot1 = _mm512_maskz_loadu_ps(upperMask, in+(ptrdiff_t)288);
                /* Vertical sums of the two input rows. */
                __m512 sum0 = _mm512_add_ps(top0, bot0);
                __m512 sum1 = _mm512_add_ps(top1, bot1);
                /* Horizontal sums of neighbouring columns. */
                __m512 odd = _mm512_permutex2var_ps(sum0, oddCols, sum1);
                __m512 even = _mm512_permutex2var_ps(sum0, evenCols, sum1);
                __m512 avg = _mm512_mul_ps(_mm512_add_ps(even, odd), quarter);
                _mm512_mask_storeu_ps(rowOut+(ptrdiff_t)64*seg, storeMask, avg);
            }
        }
    }
}

/*
 * Runs DenseNet121Twopl1Callee1 on the given team through
 * DenseNet121ThreaderDo1, using a 2-dimensional task descriptor with
 * hull1 = {1, 43}. The callee reads its coordinates from pt[0] and pt[1]
 * and the tensor pointer array from any1.
 * NOTE(review): hull1 is presumably the extent of the task grid in each
 * dimension -- confirm against DenseNet121ThreaderDo1.
 */
static void DenseNet121Twopl1(DenseNet121ThreaderTeam1* team44, char** tensors61) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 2;
    job.hull1[0] = 1;
    job.hull1[1] = 43;
    job.any1 = tensors61;
    job.callee1 = DenseNet121Twopl1Callee1;
    DenseNet121ThreaderDo1(team44, &job);
}

/*
 * Worker for one task of a 2x2, stride-2 average pooling step.
 *
 * Input rows are 28 floats (112 bytes) wide with a channel pitch of 3136
 * bytes; output rows are 14 floats (56 bytes) wide with a channel pitch of
 * 832 bytes. Each output element is the sum of a 2x2 input window
 * multiplied by 0.25.
 *
 * pt81[0] (b69) and pt81[1] (c88) select the slice: tasks with c88 < 25
 * process 10 channels, the remaining task processes 6.
 */
static void DenseNet121Twopl2Callee1(DenseNet121ThreaderTask1* task152, int64_t* pt81) {
    char** tensors150 = task152->any1;
    const ptrdiff_t b69 = pt81[0];
    const ptrdiff_t c88 = pt81[1];
    char*restrict src = tensors150[0]+(ptrdiff_t)3136*b69+(ptrdiff_t)31360*c88;
    char*restrict dst = tensors150[1]+(ptrdiff_t)784*b69+(ptrdiff_t)8320*c88;
    const ptrdiff_t channels = (c88 < 25) ? 10 : 6;
    /* Gather the even / odd columns of a 32-float pair of vectors. */
    const __m512i evenCols = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const __m512i oddCols = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    const __m512 quarter = _mm512_set1_ps(2.5e-01f);
    for (ptrdiff_t ch = 0; ch < channels; ++ch) {
        for (ptrdiff_t row = 0; row < 14; ++row) {
            char* in = src+(ptrdiff_t)3136*ch+(ptrdiff_t)224*row;
            char* out = dst+(ptrdiff_t)832*ch+(ptrdiff_t)56*row;
            /* A 28-float row is read as 16 + 12 floats (mask 4095). */
            __m512 top0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
            __m512 top1 = _mm512_maskz_loadu_ps(4095, in+(ptrdiff_t)64);
            __m512 bot0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)112);
            __m512 bot1 = _mm512_maskz_loadu_ps(4095, in+(ptrdiff_t)176);
            /* Vertical sums of the two input rows. */
            __m512 sum0 = _mm512_add_ps(top0, bot0);
            __m512 sum1 = _mm512_add_ps(top1, bot1);
            /* Horizontal sums of neighbouring columns. */
            __m512 odd = _mm512_permutex2var_ps(sum0, oddCols, sum1);
            __m512 even = _mm512_permutex2var_ps(sum0, evenCols, sum1);
            __m512 avg = _mm512_mul_ps(_mm512_add_ps(even, odd), quarter);
            /* 14 output floats. */
            _mm512_mask_storeu_ps(out, 16383, avg);
        }
    }
}

/*
 * Runs DenseNet121Twopl2Callee1 on the given team through
 * DenseNet121ThreaderDo1, using a 2-dimensional task descriptor with
 * hull1 = {1, 26}. The callee reads its coordinates from pt[0] and pt[1]
 * and the tensor pointer array from any1.
 * NOTE(review): hull1 is presumably the extent of the task grid in each
 * dimension -- confirm against DenseNet121ThreaderDo1.
 */
static void DenseNet121Twopl2(DenseNet121ThreaderTeam1* team88, char** tensors149) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 2;
    job.hull1[0] = 1;
    job.hull1[1] = 26;
    job.any1 = tensors149;
    job.callee1 = DenseNet121Twopl2Callee1;
    DenseNet121ThreaderDo1(team88, &job);
}

/*
 * Worker for one task of a 2x2, stride-2 average pooling step.
 *
 * Input rows are 14 floats (56 bytes) wide with a channel pitch of 832
 * bytes; output rows are 7 floats (28 bytes) wide with a channel pitch of
 * 320 bytes. Each output element is the sum of a 2x2 input window
 * multiplied by 0.25.
 *
 * pt161[0] (b98) and pt161[1] (c190) select the slice: tasks with c190 < 13
 * process 37 channels, the remaining task processes 31.
 */
static void DenseNet121Twopl3Callee1(DenseNet121ThreaderTask1* task312, int64_t* pt161) {
    char** tensors310 = task312->any1;
    const ptrdiff_t b98 = pt161[0];
    const ptrdiff_t c190 = pt161[1];
    char*restrict src = tensors310[0]+(ptrdiff_t)784*b98+(ptrdiff_t)30784*c190;
    char*restrict dst = tensors310[1]+(ptrdiff_t)196*b98+(ptrdiff_t)11840*c190;
    const ptrdiff_t channels = (c190 < 13) ? 37 : 31;
    /* Used with the single-source permute, which only looks at the low four
     * index bits; the seven stored lanes therefore read columns 0,2,..,12
     * (even) and 1,3,..,13 (odd). */
    const __m512i evenCols = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    const __m512i oddCols = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    const __m512 quarter = _mm512_set1_ps(2.5e-01f);
    for (ptrdiff_t ch = 0; ch < channels; ++ch) {
        for (ptrdiff_t row = 0; row < 7; ++row) {
            char* in = src+(ptrdiff_t)832*ch+(ptrdiff_t)112*row;
            char* out = dst+(ptrdiff_t)320*ch+(ptrdiff_t)28*row;
            /* Two consecutive 14-float input rows. */
            __m512 top = _mm512_maskz_loadu_ps(16383, in+(ptrdiff_t)0);
            __m512 bot = _mm512_maskz_loadu_ps(16383, in+(ptrdiff_t)56);
            __m512 sum = _mm512_add_ps(top, bot);
            /* Horizontal sums of neighbouring columns. */
            __m512 odd = _mm512_permutexvar_ps(oddCols, sum);
            __m512 even = _mm512_permutexvar_ps(evenCols, sum);
            __m512 avg = _mm512_mul_ps(_mm512_add_ps(even, odd), quarter);
            /* 7 output floats. */
            _mm512_mask_storeu_ps(out, 127, avg);
        }
    }
}

/*
 * Runs DenseNet121Twopl3Callee1 on the given team through
 * DenseNet121ThreaderDo1, using a 2-dimensional task descriptor with
 * hull1 = {1, 14}. The callee reads its coordinates from pt[0] and pt[1]
 * and the tensor pointer array from any1.
 * NOTE(review): hull1 is presumably the extent of the task grid in each
 * dimension -- confirm against DenseNet121ThreaderDo1.
 */
static void DenseNet121Twopl3(DenseNet121ThreaderTeam1* team168, char** tensors309) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 2;
    job.hull1[0] = 1;
    job.hull1[1] = 14;
    job.any1 = tensors309;
    job.callee1 = DenseNet121Twopl3Callee1;
    DenseNet121ThreaderDo1(team168, &job);
}

// Worker for one task of a 3x3, stride-2 max pooling step.
//
// Input rows are 112 floats (448 bytes) wide, output rows 56 floats
// (224 bytes) wide; 56 output rows are produced per channel. Output column x
// is the maximum over input columns 2x-1, 2x, 2x+1 and output row y over
// input rows 2y-1, 2y, 2y+1. The row and column at index -1 do not exist and
// are simply left out of the maximum.
// pt11[0] (b43), pt11[1] (e5) and pt11[2] (c4) select the slice; each call
// handles one channel (the i10 loop runs once).
static void DenseNet121Thrpl1Callee1(DenseNet121ThreaderTask1* task12, int64_t* pt11) {
char** tensors10 = task12->any1;
ptrdiff_t b43 = pt11[0];
ptrdiff_t e5 = pt11[1];
ptrdiff_t c4 = pt11[2];
// ptr1 is backed off by one input row (448 bytes) so that, for output row j,
// offsets 0 / 448 / 896 (+896*j) address input rows 2j-1 / 2j / 2j+1.
char*restrict ptr1 = tensors10[0]-(ptrdiff_t)448+(ptrdiff_t)50176*b43+(ptrdiff_t)448*e5+(ptrdiff_t)50240*c4;
char*restrict ptr2 = tensors10[1]+(ptrdiff_t)12544*b43+(ptrdiff_t)224*e5+(ptrdiff_t)12608*c4;
for (ptrdiff_t i10 = 0; i10 < 1; ++i10) {
// ---- Output row 0: only two input rows exist (offsets 448 and 896). ----
// Columns 0..31 of both rows.
__m512 in1 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 in2 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat894 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat895 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
// Vertical maximum.
in1 = _mm512_max_ps(in1, dat894);
in2 = _mm512_max_ps(in2, dat895);
// pm57 gathers columns 2x (centre), pm58 columns 2x+1 (right), pm59 columns
// 2x-1 (left); pm59's lane 0 (index 31) is the column carried over from the
// previous 32-column segment.
__m512i pm57 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm58 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm59 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out1 = _mm512_permutex2var_ps(in1, pm57, in2);
__m512 pack263 = _mm512_permutex2var_ps(in1, pm58, in2);
__m512 pack264 = _mm512_permutex2var_ps(in1, pm59, in2);
out1 = _mm512_mask_max_ps(out1, 65535, out1, pack263);
// Mask 65534 skips lane 0: output column 0 has no left neighbour.
out1 = _mm512_mask_max_ps(out1, 65534, out1, pack264);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*0, 65535, out1);
// Middle segments: columns 32..63 and 64..95.
for (ptrdiff_t k44 = 1; k44 < 3; ++k44) {
__m512 in3 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 in4 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat896 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat897 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
in3 = _mm512_max_ps(in3, dat896);
in4 = _mm512_max_ps(in4, dat897);
// Put the previous segment's last column into lane 15 of the second source,
// which is where pm59's index 31 reads the left neighbour of lane 0.
__m512 blend1 = _mm512_mask_mov_ps(in4, 32768, in2);
__m512 out2 = _mm512_permutex2var_ps(in3, pm57, in4);
__m512 pack265 = _mm512_permutex2var_ps(in3, pm58, in4);
__m512 pack266 = _mm512_permutex2var_ps(in3, pm59, blend1);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack265);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack266);
// Carry this segment's upper half to the next segment.
in2 = in4;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*k44, 65535, out2);
}
// Last segment: columns 96..111 (one vector) -> 8 outputs.
__m512 in5 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
__m512 dat898 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
in5 = _mm512_max_ps(in5, dat898);
// The single-source permute uses only the low four index bits, so pm59's
// index 31 reads lane 15, which blend2 fills with the carried column.
__m512 blend2 = _mm512_mask_mov_ps(in5, 32768, in2);
__m512 out3 = _mm512_permutexvar_ps(pm57, in5);
__m512 pack267 = _mm512_permutexvar_ps(pm58, in5);
__m512 pack268 = _mm512_permutexvar_ps(pm59, blend2);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack267);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack268);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*3, 255, out3);
// ---- Output rows 1..55: three input rows each (offsets 0, 448, 896). ----
for (ptrdiff_t j6 = 1; j6 < 56; ++j6) {
// Columns 0..31 of the three rows.
__m512 in6 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 in7 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat899 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat901 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat900 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat902 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
// Vertical maximum over the three rows.
in6 = _mm512_max_ps(in6, dat899);
in7 = _mm512_max_ps(in7, dat901);
in6 = _mm512_max_ps(in6, dat900);
in7 = _mm512_max_ps(in7, dat902);
// Same centre / right / left gathers as pm57 / pm58 / pm59 above.
__m512i pm60 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm61 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm62 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out4 = _mm512_permutex2var_ps(in6, pm60, in7);
__m512 pack269 = _mm512_permutex2var_ps(in6, pm61, in7);
__m512 pack270 = _mm512_permutex2var_ps(in6, pm62, in7);
out4 = _mm512_mask_max_ps(out4, 65535, out4, pack269);
// Mask 65534 skips lane 0: output column 0 has no left neighbour.
out4 = _mm512_mask_max_ps(out4, 65534, out4, pack270);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*0, 65535, out4);
// Middle segments: columns 32..63 and 64..95.
for (ptrdiff_t k45 = 1; k45 < 3; ++k45) {
__m512 in8 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 in9 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat903 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat905 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat904 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat906 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
in8 = _mm512_max_ps(in8, dat903);
in9 = _mm512_max_ps(in9, dat905);
in8 = _mm512_max_ps(in8, dat904);
in9 = _mm512_max_ps(in9, dat906);
// Lane 15 of the second source receives the previous segment's last column.
__m512 blend3 = _mm512_mask_mov_ps(in9, 32768, in7);
__m512 out5 = _mm512_permutex2var_ps(in8, pm60, in9);
__m512 pack271 = _mm512_permutex2var_ps(in8, pm61, in9);
__m512 pack272 = _mm512_permutex2var_ps(in8, pm62, blend3);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack271);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack272);
// Carry this segment's upper half to the next segment.
in7 = in9;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*k45, 65535, out5);
}
// Last segment: columns 96..111 (one vector per row) -> 8 outputs.
__m512 in10 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat907 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat908 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
in10 = _mm512_max_ps(in10, dat907);
in10 = _mm512_max_ps(in10, dat908);
// As for blend2: lane 15 supplies the carried left neighbour of lane 0.
__m512 blend4 = _mm512_mask_mov_ps(in10, 32768, in7);
__m512 out6 = _mm512_permutexvar_ps(pm60, in10);
__m512 pack273 = _mm512_permutexvar_ps(pm61, in10);
__m512 pack274 = _mm512_permutexvar_ps(pm62, blend4);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack273);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack274);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*3, 255, out6);
}
}
}

/*
 * Runs DenseNet121Thrpl1Callee1 on the given team through
 * DenseNet121ThreaderDo1, using a 3-dimensional task descriptor with
 * hull1 = {1, 1, 64}. The callee reads its coordinates from pt[0..2] and
 * the tensor pointer array from any1.
 * NOTE(review): hull1 is presumably the extent of the task grid in each
 * dimension -- confirm against DenseNet121ThreaderDo1.
 */
static void DenseNet121Thrpl1(DenseNet121ThreaderTeam1* team18, char** tensors9) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 1;
    job.hull1[2] = 64;
    job.any1 = tensors9;
    job.callee1 = DenseNet121Thrpl1Callee1;
    DenseNet121ThreaderDo1(team18, &job);
}

// Thread task callee: repacks one group of float32 weights into half
// precision and copies the group's biases next to the packed weights.
//
// pt215[0] is the group index. Buffers come from task420->any1:
//   [0] source float32 weights, 65536 bytes apart per group, laid out as
//       slices with a 4096-byte pitch (1024 floats per slice)
//   [1] source float32 biases, 64 bytes per group
//   [2] destination: packed half-precision weights (32768 bytes per group)
//       with the float32 biases placed at byte offset 2048000
//
// Groups below 62 pack 16 slices and 16 biases; any other group takes the
// tail path, which packs 8 slices and 8 biases.
//
// Packed layout: for each 16-float chunk of the slices, consecutive slice
// pairs (2p, 2p+1) are converted to half precision (round to nearest,
// exceptions suppressed) and stored as one 64-byte vector, slice 2p in the
// low 256 bits and slice 2p+1 in the high 256 bits.
static void DenseNet121FcArrange1Callee1(DenseNet121ThreaderTask1* task420, int64_t* pt215) {
    char** bufs = task420->any1;
    ptrdiff_t grp = pt215[0];
    char*restrict srcWts = bufs[0]+(ptrdiff_t)65536*grp;
    char*restrict srcBias = bufs[1]+(ptrdiff_t)64*grp;
    char*restrict dstWts = bufs[2]+(ptrdiff_t)32768*grp;
    char*restrict dstBias = bufs[2]+(ptrdiff_t)2048000+(ptrdiff_t)64*grp;
    if (grp >= 62) {
        // Tail group: 8 slices -> 4 pairs, i.e. 256 packed bytes per chunk.
        for (ptrdiff_t chunk = 0; chunk < 64; ++chunk) {
            for (ptrdiff_t pair = 0; pair < 4; ++pair) {
                ptrdiff_t from = (ptrdiff_t)64*chunk+(ptrdiff_t)8192*pair;
                __m512 even = _mm512_maskz_loadu_ps(65535, srcWts+from);
                __m512 odd = _mm512_maskz_loadu_ps(65535, srcWts+from+(ptrdiff_t)4096);
                __m256i evenHalf = _mm512_cvtps_ph(even, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                __m256i oddHalf = _mm512_cvtps_ph(odd, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                __m512i packed = _mm512_inserti64x4(_mm512_castsi256_si512(evenHalf), oddHalf, 1);
                _mm512_mask_storeu_epi32(dstWts+(ptrdiff_t)256*chunk+(ptrdiff_t)64*pair, 65535, packed);
            }
        }
        // Only 8 biases exist for the tail group.
        __m512 tailBias = _mm512_maskz_loadu_ps(255, srcBias);
        _mm512_mask_storeu_ps(dstBias, 255, tailBias);
        return;
    }
    // Full group: 16 slices -> 8 pairs, i.e. 512 packed bytes per chunk.
    for (ptrdiff_t chunk = 0; chunk < 64; ++chunk) {
        for (ptrdiff_t pair = 0; pair < 8; ++pair) {
            ptrdiff_t from = (ptrdiff_t)64*chunk+(ptrdiff_t)8192*pair;
            __m512 even = _mm512_maskz_loadu_ps(65535, srcWts+from);
            __m512 odd = _mm512_maskz_loadu_ps(65535, srcWts+from+(ptrdiff_t)4096);
            __m256i evenHalf = _mm512_cvtps_ph(even, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
            __m256i oddHalf = _mm512_cvtps_ph(odd, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
            __m512i packed = _mm512_inserti64x4(_mm512_castsi256_si512(evenHalf), oddHalf, 1);
            _mm512_mask_storeu_epi32(dstWts+(ptrdiff_t)512*chunk+(ptrdiff_t)64*pair, 65535, packed);
        }
    }
    __m512 fullBias = _mm512_maskz_loadu_ps(65535, srcBias);
    _mm512_mask_storeu_ps(dstBias, 65535, fullBias);
}

// Submits DenseNet121FcArrange1Callee1 to team222 as a 1-dimensional task
// with 63 indices; tensors417 is passed to the callee through any1.
static void DenseNet121FcArrange1(DenseNet121ThreaderTeam1* team222, char** tensors417) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 63;
    job.any1 = tensors417;
    job.callee1 = DenseNet121FcArrange1Callee1;
    DenseNet121ThreaderDo1(team222, &job);
}

// Thread task callee for the fully-connected apply step: for one group of
// outputs, computes dot(weights, input) + bias, where the weights are stored
// in half precision and widened to float32 before the multiply-add.
//
// pt216[0] is the group index (t36). Buffers come from task422->any1:
//   [0] packed half-precision weights, 32768 bytes per group, with the
//       float32 biases at byte offset 2048000 (64 bytes per group)
//   [1] float32 input, 1024 values, read in full by every group
//   [2] float32 output, 64 bytes per group
//
// Groups below 62 produce 16 outputs; any other group takes the tail path
// and produces 8 outputs.
static void DenseNet121FcApply1Callee1(DenseNet121ThreaderTask1* task422, int64_t* pt216) {
char** tensors420 = task422->any1;
ptrdiff_t t36 = pt216[0];
char*restrict wtPtr67 = tensors420[0]+(ptrdiff_t)32768*t36;
char*restrict biasPtr67 = tensors420[0]+(ptrdiff_t)2048000+(ptrdiff_t)64*t36;
char*restrict datPtr133 = tensors420[1];
char*restrict datPtr134 = tensors420[2]+(ptrdiff_t)64*t36;
if (t36 < 62) {
// Full group: 16 outputs. This loop runs exactly once.
for (ptrdiff_t i249 = 0; i249 < 1; ++i249) {
// One 16-lane accumulator per output; each lane collects a partial sum.
__m512 sum2571 = _mm512_setzero_ps();
__m512 sum2572 = _mm512_setzero_ps();
__m512 sum2573 = _mm512_setzero_ps();
__m512 sum2574 = _mm512_setzero_ps();
__m512 sum2575 = _mm512_setzero_ps();
__m512 sum2576 = _mm512_setzero_ps();
__m512 sum2577 = _mm512_setzero_ps();
__m512 sum2578 = _mm512_setzero_ps();
__m512 sum2579 = _mm512_setzero_ps();
__m512 sum2580 = _mm512_setzero_ps();
__m512 sum2581 = _mm512_setzero_ps();
__m512 sum2582 = _mm512_setzero_ps();
__m512 sum2583 = _mm512_setzero_ps();
__m512 sum2584 = _mm512_setzero_ps();
__m512 sum2585 = _mm512_setzero_ps();
__m512 sum2586 = _mm512_setzero_ps();
// Each iteration consumes 16 input floats and 512 bytes of packed weights
// (8 vectors, each holding 16 halves for two consecutive outputs).
for (ptrdiff_t j214 = 0; j214 < 64; ++j214) {
__m512i wts1 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)0+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512 dat2431 = _mm512_maskz_loadu_ps(65535, datPtr133+(ptrdiff_t)0+(ptrdiff_t)64*j214);
__m512i wts2 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)64+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512i wts3 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)128+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512i wts4 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)192+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
// Widen the low and high 256-bit halves of each packed vector to float32.
__m512 wtLo17 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts1));
__m512 wtHi17 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts1, 1));
__m512 wtLo18 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts2));
__m512 wtHi18 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts2, 1));
__m512 wtLo19 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts3));
__m512 wtHi19 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts3, 1));
__m512 wtLo20 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts4));
__m512 wtHi20 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts4, 1));
sum2571 = _mm512_fmadd_ps(wtLo17, dat2431, sum2571);
sum2572 = _mm512_fmadd_ps(wtHi17, dat2431, sum2572);
sum2573 = _mm512_fmadd_ps(wtLo18, dat2431, sum2573);
sum2574 = _mm512_fmadd_ps(wtHi18, dat2431, sum2574);
sum2575 = _mm512_fmadd_ps(wtLo19, dat2431, sum2575);
sum2576 = _mm512_fmadd_ps(wtHi19, dat2431, sum2576);
sum2577 = _mm512_fmadd_ps(wtLo20, dat2431, sum2577);
sum2578 = _mm512_fmadd_ps(wtHi20, dat2431, sum2578);
// Second half of the 512-byte block: outputs 8..15 of the group.
__m512i wts5 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)256+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512i wts6 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)320+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512i wts7 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)384+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512i wts8 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)448+(ptrdiff_t)32768*i249+(ptrdiff_t)512*j214);
__m512 wtLo21 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts5));
__m512 wtHi21 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts5, 1));
__m512 wtLo22 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts6));
__m512 wtHi22 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts6, 1));
__m512 wtLo23 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts7));
__m512 wtHi23 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts7, 1));
__m512 wtLo24 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts8));
__m512 wtHi24 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts8, 1));
sum2579 = _mm512_fmadd_ps(wtLo21, dat2431, sum2579);
sum2580 = _mm512_fmadd_ps(wtHi21, dat2431, sum2580);
sum2581 = _mm512_fmadd_ps(wtLo22, dat2431, sum2581);
sum2582 = _mm512_fmadd_ps(wtHi22, dat2431, sum2582);
sum2583 = _mm512_fmadd_ps(wtLo23, dat2431, sum2583);
sum2584 = _mm512_fmadd_ps(wtHi23, dat2431, sum2584);
sum2585 = _mm512_fmadd_ps(wtLo24, dat2431, sum2585);
sum2586 = _mm512_fmadd_ps(wtHi24, dat2431, sum2586);
}
__m512 bias8 = _mm512_maskz_loadu_ps(65535, biasPtr67+(ptrdiff_t)0+(ptrdiff_t)64*i249);
// Index vectors for the two-source permutes of the reduction below:
// pm1Lo1/pm1Hi1 interleave the even/odd elements of two vectors,
// pm4Lo1/pm4Hi1 interleave the even/odd 128-bit blocks of two vectors.
__m512i pm1Lo1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
__m512i pm1Hi1 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
__m512i pm4Lo1 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi1 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Horizontal reduction of the 16 accumulators into one vector. Each stage
// merges two vectors and adds the complementary halves, so the number of
// live vectors halves while each accumulator's lane count halves too.
// Stage 1 (256-bit halves): accumulator k is paired with accumulator k+8.
__m512 upper4 = _mm512_shuffle_f32x4(sum2571, sum2579, 238);
__m512 upper5 = _mm512_shuffle_f32x4(sum2575, sum2583, 238);
sum2571 = _mm512_shuffle_f32x4(sum2571, sum2579, 68);
sum2575 = _mm512_shuffle_f32x4(sum2575, sum2583, 68);
sum2571 = _mm512_add_ps(sum2571, upper4);
sum2575 = _mm512_add_ps(sum2575, upper5);
__m512 upper7 = _mm512_shuffle_f32x4(sum2573, sum2581, 238);
__m512 upper8 = _mm512_shuffle_f32x4(sum2577, sum2585, 238);
sum2573 = _mm512_shuffle_f32x4(sum2573, sum2581, 68);
sum2577 = _mm512_shuffle_f32x4(sum2577, sum2585, 68);
sum2573 = _mm512_add_ps(sum2573, upper7);
sum2577 = _mm512_add_ps(sum2577, upper8);
// Stage 2 (128-bit blocks): afterwards each 128-bit block holds four
// partial sums belonging to a single accumulator.
__m512 upper3 = _mm512_permutex2var_ps(sum2571, pm4Hi1, sum2575);
__m512 upper6 = _mm512_permutex2var_ps(sum2573, pm4Hi1, sum2577);
sum2571 = _mm512_permutex2var_ps(sum2571, pm4Lo1, sum2575);
sum2573 = _mm512_permutex2var_ps(sum2573, pm4Lo1, sum2577);
sum2571 = _mm512_add_ps(sum2571, upper3);
sum2573 = _mm512_add_ps(sum2573, upper6);
// Stages 1 and 2 again, for the remaining accumulators.
__m512 upper11 = _mm512_shuffle_f32x4(sum2572, sum2580, 238);
__m512 upper12 = _mm512_shuffle_f32x4(sum2576, sum2584, 238);
sum2572 = _mm512_shuffle_f32x4(sum2572, sum2580, 68);
sum2576 = _mm512_shuffle_f32x4(sum2576, sum2584, 68);
sum2572 = _mm512_add_ps(sum2572, upper11);
sum2576 = _mm512_add_ps(sum2576, upper12);
__m512 upper14 = _mm512_shuffle_f32x4(sum2574, sum2582, 238);
__m512 upper15 = _mm512_shuffle_f32x4(sum2578, sum2586, 238);
sum2574 = _mm512_shuffle_f32x4(sum2574, sum2582, 68);
sum2578 = _mm512_shuffle_f32x4(sum2578, sum2586, 68);
sum2574 = _mm512_add_ps(sum2574, upper14);
sum2578 = _mm512_add_ps(sum2578, upper15);
__m512 upper10 = _mm512_permutex2var_ps(sum2572, pm4Hi1, sum2576);
__m512 upper13 = _mm512_permutex2var_ps(sum2574, pm4Hi1, sum2578);
sum2572 = _mm512_permutex2var_ps(sum2572, pm4Lo1, sum2576);
sum2574 = _mm512_permutex2var_ps(sum2574, pm4Lo1, sum2578);
sum2572 = _mm512_add_ps(sum2572, upper10);
sum2574 = _mm512_add_ps(sum2574, upper13);
// Stage 3 (64-bit pairs inside each 128-bit block): two partial sums per
// accumulator remain.
__m512 upper2 = _mm512_shuffle_ps(sum2571, sum2573, 238);
__m512 upper9 = _mm512_shuffle_ps(sum2572, sum2574, 238);
sum2571 = _mm512_shuffle_ps(sum2571, sum2573, 68);
sum2572 = _mm512_shuffle_ps(sum2572, sum2574, 68);
sum2571 = _mm512_add_ps(sum2571, upper2);
sum2572 = _mm512_add_ps(sum2572, upper9);
// Stage 4 (single elements): lane k of the result is the complete sum of
// the k-th accumulator (sum2571 is lane 0, sum2586 is lane 15).
__m512 upper1 = _mm512_permutex2var_ps(sum2571, pm1Hi1, sum2572);
sum2571 = _mm512_permutex2var_ps(sum2571, pm1Lo1, sum2572);
sum2571 = _mm512_add_ps(sum2571, upper1);
sum2571 = _mm512_add_ps(sum2571, bias8);
_mm512_mask_storeu_ps(datPtr134+(ptrdiff_t)0+(ptrdiff_t)64*i249, 65535, sum2571);
}
return;
}
// Tail group: 8 outputs. This loop runs exactly once.
for (ptrdiff_t i250 = 0; i250 < 1; ++i250) {
__m512 sum2587 = _mm512_setzero_ps();
__m512 sum2588 = _mm512_setzero_ps();
__m512 sum2589 = _mm512_setzero_ps();
__m512 sum2590 = _mm512_setzero_ps();
__m512 sum2591 = _mm512_setzero_ps();
__m512 sum2592 = _mm512_setzero_ps();
__m512 sum2593 = _mm512_setzero_ps();
__m512 sum2594 = _mm512_setzero_ps();
// Each iteration consumes 16 input floats and 256 bytes of packed weights
// (4 vectors, each holding 16 halves for two consecutive outputs).
for (ptrdiff_t j215 = 0; j215 < 64; ++j215) {
__m512i wts9 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)0+(ptrdiff_t)32768*i250+(ptrdiff_t)256*j215);
__m512 dat2432 = _mm512_maskz_loadu_ps(65535, datPtr133+(ptrdiff_t)0+(ptrdiff_t)64*j215);
__m512i wts10 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)64+(ptrdiff_t)32768*i250+(ptrdiff_t)256*j215);
__m512i wts11 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)128+(ptrdiff_t)32768*i250+(ptrdiff_t)256*j215);
__m512i wts12 = _mm512_maskz_loadu_epi32(65535, wtPtr67+(ptrdiff_t)192+(ptrdiff_t)32768*i250+(ptrdiff_t)256*j215);
__m512 wtLo25 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts9));
__m512 wtHi25 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts9, 1));
__m512 wtLo26 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts10));
__m512 wtHi26 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts10, 1));
__m512 wtLo27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts11));
__m512 wtHi27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts11, 1));
__m512 wtLo28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts12));
__m512 wtHi28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts12, 1));
sum2587 = _mm512_fmadd_ps(wtLo25, dat2432, sum2587);
sum2588 = _mm512_fmadd_ps(wtHi25, dat2432, sum2588);
sum2589 = _mm512_fmadd_ps(wtLo26, dat2432, sum2589);
sum2590 = _mm512_fmadd_ps(wtHi26, dat2432, sum2590);
sum2591 = _mm512_fmadd_ps(wtLo27, dat2432, sum2591);
sum2592 = _mm512_fmadd_ps(wtHi27, dat2432, sum2592);
sum2593 = _mm512_fmadd_ps(wtLo28, dat2432, sum2593);
sum2594 = _mm512_fmadd_ps(wtHi28, dat2432, sum2594);
}
// Only 8 biases exist for the tail group (mask 255).
__m512 bias9 = _mm512_maskz_loadu_ps(255, biasPtr67+(ptrdiff_t)0+(ptrdiff_t)32*i250);
// pmEven1/pmOdd1 select the even/odd elements of a single vector;
// pm4Lo2/pm4Hi2 interleave the even/odd 128-bit blocks of two vectors.
__m512i pmEven1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmOdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm4Lo2 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi2 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Horizontal reduction of the 8 accumulators, same scheme as the full path.
// Stage 1 (256-bit halves): accumulator k is paired with accumulator k+4.
__m512 upper18 = _mm512_shuffle_f32x4(sum2587, sum2591, 238);
__m512 upper19 = _mm512_shuffle_f32x4(sum2589, sum2593, 238);
sum2587 = _mm512_shuffle_f32x4(sum2587, sum2591, 68);
sum2589 = _mm512_shuffle_f32x4(sum2589, sum2593, 68);
sum2587 = _mm512_add_ps(sum2587, upper18);
sum2589 = _mm512_add_ps(sum2589, upper19);
__m512 upper21 = _mm512_shuffle_f32x4(sum2588, sum2592, 238);
__m512 upper22 = _mm512_shuffle_f32x4(sum2590, sum2594, 238);
sum2588 = _mm512_shuffle_f32x4(sum2588, sum2592, 68);
sum2590 = _mm512_shuffle_f32x4(sum2590, sum2594, 68);
sum2588 = _mm512_add_ps(sum2588, upper21);
sum2590 = _mm512_add_ps(sum2590, upper22);
// Stage 2 (128-bit blocks): four partial sums per accumulator remain.
__m512 upper17 = _mm512_permutex2var_ps(sum2587, pm4Hi2, sum2589);
__m512 upper20 = _mm512_permutex2var_ps(sum2588, pm4Hi2, sum2590);
sum2587 = _mm512_permutex2var_ps(sum2587, pm4Lo2, sum2589);
sum2588 = _mm512_permutex2var_ps(sum2588, pm4Lo2, sum2590);
sum2587 = _mm512_add_ps(sum2587, upper17);
sum2588 = _mm512_add_ps(sum2588, upper20);
// Stage 3 (64-bit pairs): two partial sums per accumulator remain.
__m512 upper16 = _mm512_shuffle_ps(sum2587, sum2588, 238);
sum2587 = _mm512_shuffle_ps(sum2587, sum2588, 68);
sum2587 = _mm512_add_ps(sum2587, upper16);
// Stage 4 (single elements): lanes 0..7 hold the complete sums of
// sum2587..sum2594 in order; only those 8 lanes are stored.
__m512 upper23 = _mm512_permutexvar_ps(pmOdd1, sum2587);
sum2587 = _mm512_permutexvar_ps(pmEven1, sum2587);
sum2587 = _mm512_add_ps(sum2587, upper23);
sum2587 = _mm512_add_ps(sum2587, bias9);
_mm512_mask_storeu_ps(datPtr134+(ptrdiff_t)0+(ptrdiff_t)32*i250, 255, sum2587);
}
}

// Submits DenseNet121FcApply1Callee1 to team223 as a 1-dimensional task
// with 63 indices; tensors419 is passed to the callee through any1.
static void DenseNet121FcApply1(DenseNet121ThreaderTeam1* team223, char** tensors419) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 63;
    job.any1 = tensors419;
    job.callee1 = DenseNet121FcApply1Callee1;
    DenseNet121ThreaderDo1(team223, &job);
}

static void DenseNet121OneArrangeWts1Callee1(DenseNet121ThreaderTask1* task14, int64_t* pt12) {
char** tensors12 = task14->any1;
(void)pt12;
char*restrict wtPtr2 = tensors12[0]+(ptrdiff_t)3340*0+(ptrdiff_t)32768*0;
char*restrict biasPtr2 = tensors12[1]+(ptrdiff_t)512*0;
char*restrict bnPtr2 = tensors12[2]+(ptrdiff_t)8*128*0;
char*restrict arranged1 = tensors12[3]+(ptrdiff_t)428032*0+(ptrdiff_t)33280*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i12 = 0; i12 < ii1; ++i12) {
ptrdiff_t j7 = 0;
ptrdiff_t jj19 = j7+8;
for (; j7 < jj19; ++j7) {
if (j7 < 7) {
ptrdiff_t k47 = 0+16*(j7-0);
ptrdiff_t l10 = (size_t)(0+k47)/6;
ptrdiff_t cut2 = (size_t)(0+k47)%6;
switch (cut2) {
case 0:;
case 2: {
__m512 sum3 = _mm512_maskz_loadu_ps(65535, biasPtr2+512*i12+4*k47);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k47+128*i12));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k47+128*i12)+(ptrdiff_t)64);
__m512 postMul5 = _mm512_permutex2var_ps(masLo1, pmMul2, masHi1);
__m512 postAdd3 = _mm512_permutex2var_ps(masLo1, pmAdd2, masHi1);
sum3 = _mm512_fmadd_ps(sum3, postMul5, postAdd3);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum3);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 65535-(4095>>cut2), sum3);
ptrdiff_t c6 = 0;
for (; c6 != 4; ++c6) {
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)0);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)256);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)512);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)768);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)1024);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)1280);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)1536);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)1792);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)2048);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)2304);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)2560);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)2816);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)3072);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)3328);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)3584);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c6+(ptrdiff_t)3840);
__m512 tmp1 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp2 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp3 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp4 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp5 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp6 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp7 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp8 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp9 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp10 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp11 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp12 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp13 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp14 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp15 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp16 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt31 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt39 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt32 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt40 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt33 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt41 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt34 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt42 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt35 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt43 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt36 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt44 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt37 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt45 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt38 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt46 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt31 = _mm512_mul_ps(wt31, postMul5);
wt32 = _mm512_mul_ps(wt32, postMul5);
wt33 = _mm512_mul_ps(wt33, postMul5);
wt34 = _mm512_mul_ps(wt34, postMul5);
wt35 = _mm512_mul_ps(wt35, postMul5);
wt36 = _mm512_mul_ps(wt36, postMul5);
wt37 = _mm512_mul_ps(wt37, postMul5);
wt38 = _mm512_mul_ps(wt38, postMul5);
wt39 = _mm512_mul_ps(wt39, postMul5);
wt40 = _mm512_mul_ps(wt40, postMul5);
wt41 = _mm512_mul_ps(wt41, postMul5);
wt42 = _mm512_mul_ps(wt42, postMul5);
wt43 = _mm512_mul_ps(wt43, postMul5);
wt44 = _mm512_mul_ps(wt44, postMul5);
wt45 = _mm512_mul_ps(wt45, postMul5);
wt46 = _mm512_mul_ps(wt46, postMul5);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut2, wt44);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut2, wt45);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut2, wt46);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt31);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt32);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt33);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt34);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt35);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt36);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt37);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt38);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt39);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt40);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt41);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt42);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt43);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt44);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt45);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c6)+(ptrdiff_t)1536, 4032>>cut2, wt46);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt31);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt32);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt33);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt34);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt35);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt36);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt37);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt38);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt39);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt40);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt41);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt42);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt43);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt44);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt45);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c6)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt46);
}
break;
}
default: {
cut2 = 4;
__m512 sum4 = _mm512_maskz_loadu_ps(65535, biasPtr2+512*i12+4*k47);
__m512i pmMul3 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd3 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k47+128*i12));
__m512 masHi2 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k47+128*i12)+(ptrdiff_t)64);
__m512 postMul6 = _mm512_permutex2var_ps(masLo2, pmMul3, masHi2);
__m512 postAdd4 = _mm512_permutex2var_ps(masLo2, pmAdd3, masHi2);
sum4 = _mm512_fmadd_ps(sum4, postMul6, postAdd4);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 258048>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)4608, 65535-(262143>>cut2), sum4);
ptrdiff_t c7 = 0;
for (; c7 != 4; ++c7) {
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)0);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)256);
__m512 wt49 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)512);
__m512 wt50 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)768);
__m512 wt51 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)1024);
__m512 wt52 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)1280);
__m512 wt53 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)1536);
__m512 wt54 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)1792);
__m512 wt55 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)2048);
__m512 wt56 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)2304);
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)2560);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)2816);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)3072);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)3328);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)3584);
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k47+64*c7+(ptrdiff_t)3840);
__m512 tmp49 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp50 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp51 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp52 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp53 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp54 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp55 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp56 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp57 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp58 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp59 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp60 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp61 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp62 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp63 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp64 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt47 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt55 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt48 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt56 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt49 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt57 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt50 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt58 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt51 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt59 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt52 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt60 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt53 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt61 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt54 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt62 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
wt47 = _mm512_mul_ps(wt47, postMul6);
wt48 = _mm512_mul_ps(wt48, postMul6);
wt49 = _mm512_mul_ps(wt49, postMul6);
wt50 = _mm512_mul_ps(wt50, postMul6);
wt51 = _mm512_mul_ps(wt51, postMul6);
wt52 = _mm512_mul_ps(wt52, postMul6);
wt53 = _mm512_mul_ps(wt53, postMul6);
wt54 = _mm512_mul_ps(wt54, postMul6);
wt55 = _mm512_mul_ps(wt55, postMul6);
wt56 = _mm512_mul_ps(wt56, postMul6);
wt57 = _mm512_mul_ps(wt57, postMul6);
wt58 = _mm512_mul_ps(wt58, postMul6);
wt59 = _mm512_mul_ps(wt59, postMul6);
wt60 = _mm512_mul_ps(wt60, postMul6);
wt61 = _mm512_mul_ps(wt61, postMul6);
wt62 = _mm512_mul_ps(wt62, postMul6);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)0, 63>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)0, 63>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)0, 63>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)0, 63>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)0, 63>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)0, 63>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)0, 63>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)0, 63>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)0, 63>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)0, 63>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)0, 63>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)0, 63>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)0, 63>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt48);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt49);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt50);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt51);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt52);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt53);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt54);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt55);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt56);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt57);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt58);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt59);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt60);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt61);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt62);
}
}
}
} else {
ptrdiff_t k46 = 112;
ptrdiff_t l9 = (size_t)(0+k46)/6;
ptrdiff_t cut1 = (size_t)(0+k46)%6;
__m512 sum2 = _mm512_maskz_loadu_ps(65535, biasPtr2+512*i12+4*k46);
__m512i pmMul4 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd4 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo3 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k46+128*i12));
__m512 masHi3 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k46+128*i12)+(ptrdiff_t)64);
__m512 postMul4 = _mm512_permutex2var_ps(masLo3, pmMul4, masHi3);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo3, pmAdd4, masHi3);
sum2 = _mm512_fmadd_ps(sum2, postMul4, postAdd2);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 258048>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*0+(ptrdiff_t)4608, 65535-(262143>>cut1), sum2);
ptrdiff_t c5 = 0;
for (; c5 != 4; ++c5) {
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)0);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)256);
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)512);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)768);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)1024);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)1280);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)1536);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)1792);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)2048);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)2304);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)2560);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)2816);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)3072);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)3328);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)3584);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr2+32768*i12+256*k46+64*c5+(ptrdiff_t)3840);
__m512 tmp97 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp98 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp99 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp100 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp101 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp102 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp103 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp104 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp105 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp106 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp107 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp108 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp109 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp110 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp111 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp112 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt15 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt23 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt16 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt24 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt17 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt25 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt18 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt26 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt19 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt27 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt20 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt28 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt21 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt29 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt22 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt30 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
wt15 = _mm512_mul_ps(wt15, postMul4);
wt16 = _mm512_mul_ps(wt16, postMul4);
wt17 = _mm512_mul_ps(wt17, postMul4);
wt18 = _mm512_mul_ps(wt18, postMul4);
wt19 = _mm512_mul_ps(wt19, postMul4);
wt20 = _mm512_mul_ps(wt20, postMul4);
wt21 = _mm512_mul_ps(wt21, postMul4);
wt22 = _mm512_mul_ps(wt22, postMul4);
wt23 = _mm512_mul_ps(wt23, postMul4);
wt24 = _mm512_mul_ps(wt24, postMul4);
wt25 = _mm512_mul_ps(wt25, postMul4);
wt26 = _mm512_mul_ps(wt26, postMul4);
wt27 = _mm512_mul_ps(wt27, postMul4);
wt28 = _mm512_mul_ps(wt28, postMul4);
wt29 = _mm512_mul_ps(wt29, postMul4);
wt30 = _mm512_mul_ps(wt30, postMul4);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)3072, 258048>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(1+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(2+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt16);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(3+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(4+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(5+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt19);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(6+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt20);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(7+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt21);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(8+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt22);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(9+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt23);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(10+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt24);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(11+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt25);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(12+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt26);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(13+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt27);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(14+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt28);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(15+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt29);
_mm512_mask_storeu_ps(arranged1+33280*i12+1560*l9+4*cut1+8*(16+16*c5)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt30);
}
}
}
}
}

// Launches the weight-arrangement callee through the threader.
//   team19    - threader team passed through to DenseNet121ThreaderDo1.
//   tensors11 - tensor pointer array, handed to the callee via task.any1.
// The task is described as 3-dimensional with an extent of 1 in every
// dimension (presumably a single callee invocation; the threader's
// iteration semantics are defined elsewhere in this file).
static void DenseNet121OneArrangeWts1(DenseNet121ThreaderTeam1* team19, char** tensors11) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = 1;
    }
    job.any1 = tensors11;
    job.callee1 = DenseNet121OneArrangeWts1Callee1;
    DenseNet121ThreaderDo1(team19, &job);
}

// Data-arrangement worker: one call handles the slices owned by one task.
//   task16 - task whose any1 field holds the tensor pointer array:
//            [0] source data, [1] per-channel (multiplier, addend) float
//            pairs, [2] destination buffer.
//   pt13   - task coordinates; only pt13[1] is read.
//
// Work unit: a "slice" is 64 consecutive floats (four 16-lane vectors,
// 256 bytes) of one channel. For every slice it owns and every one of the
// 64 channels, the worker computes max(0, x*mul + add) and stores the
// result so that the 64 channels of a slice sit contiguously
// (256 bytes per channel, 16384 bytes per slice).
// Source addressing is 12608 bytes per channel and 256 bytes per slice.
//
// Slice ownership: the task with coordinate c starts at slice 2*c and
// handles two slices, except c == 23, which also takes the final slice 48.
// This matches the extent of 24 that the launcher puts in hull1[1].
static void DenseNet121OneArrangeDats1Callee1(DenseNet121ThreaderTask1* task16, int64_t* pt13) {
    char** tensors14 = task16->any1;
    const ptrdiff_t part = pt13[1];
    // The generated base-pointer expressions only add zero-valued terms,
    // so they reduce to the raw tensor pointers.
    char*restrict src = tensors14[0];
    char*restrict bnPairs = tensors14[1];
    char*restrict dst = tensors14[2];
    const __m512 zero = _mm512_setzero_ps();
    for (ptrdiff_t outer = 0; outer < 1; ++outer) {
        ptrdiff_t slice = 2*part;
        const ptrdiff_t lastSlice = slice+(part < 23 ? 1 : 2);
        while (slice != 49) {
            for (ptrdiff_t chan = 0; chan < 64; ++chan) {
                const char* in = src+806912*outer+256*slice+12608*chan;
                char* out = dst+802816*outer+16384*slice+256*chan;
                // Two floats per channel: [0] multiplier, [1] addend.
                const float* pair = (float*)bnPairs+(ptrdiff_t)2*(chan+64*outer);
                const __m512 scale = _mm512_set1_ps(pair[0]);
                const __m512 shift = _mm512_set1_ps(pair[1]);
                __m512 vec[4];
                // Load the whole slice before writing anything back.
                for (int v = 0; v < 4; ++v) {
                    vec[v] = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64*v);
                }
                for (int v = 0; v < 4; ++v) {
                    vec[v] = _mm512_fmadd_ps(vec[v], scale, shift);
                }
                // Clamp at zero; the zero operand stays first, as in the
                // generated code.
                for (int v = 0; v < 4; ++v) {
                    vec[v] = _mm512_max_ps(zero, vec[v]);
                }
                for (int v = 0; v < 4; ++v) {
                    _mm512_mask_storeu_ps(out+(ptrdiff_t)64*v, 65535, vec[v]);
                }
            }
            // Stop once this task's last slice has been written.
            if (slice >= lastSlice) break;
            ++slice;
        }
    }
}

// Launches the data-arrangement callee through the threader.
//   team20    - threader team passed through to DenseNet121ThreaderDo1.
//   tensors13 - tensor pointer array, handed to the callee via task.any1.
// The task is described as 4-dimensional with extents {1, 24, 1, 1}.
// The callee reads its coordinate along dimension 1 and treats the value
// 23 as the last partition, which matches the extent of 24 set here.
static void DenseNet121OneArrangeDats1(DenseNet121ThreaderTeam1* team20, char** tensors13) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = 1;
    }
    // Only dimension 1 is actually partitioned.
    job.hull1[1] = 24;
    job.any1 = tensors13;
    job.callee1 = DenseNet121OneArrangeDats1Callee1;
    DenseNet121ThreaderDo1(team20, &job);
}

// Threader callee used by DenseNet121OneApply1.
//
// task18->any1 is read as an array of pointers whose first entry is a table
// of byte buffers:
//   table[0] - arranged weights (read only here)
//   table[1] - arranged input data (read only here)
//   table[2] - output data (written here)
// pt14[1] selects one 256-byte column (4 vectors of 16 floats) of the data;
// pt14[0] selects the first weight group to process, namely 11*pt14[0].
//
// Weight groups are 1560 bytes apart. Within a full group, the first 6 floats
// seed 6 rows of accumulators, followed by 64 rows of 6 floats each that are
// broadcast and multiply-accumulated against the 64 input rows of the column.
// Every accumulator is clamped below at zero (max with 0) before being
// stored; output rows are 12608 bytes apart and groups 75648 bytes apart.
// NOTE(review): the seed floats look like a per-output-channel bias and the
// clamp like a fused ReLU - confirm against the weight arranger.
//
// Control flow, kept exactly as generated:
//   * a column index of 49 does nothing;
//   * groups are walked from 11*pt14[0]; the walk returns after the group
//     numbered start+10, or falls through when it reaches group 21;
//   * on fall-through, group 21 is handled as a narrower group of 2 rows
//     (seed of 2 floats, then 64 rows of 2 floats).
static void DenseNet121OneApply1Callee1(DenseNet121ThreaderTask1* task18, int64_t* pt14) {
    void** ctx = task18->any1;
    char** bufs = ctx[0];
    const ptrdiff_t col = pt14[1];
    const ptrdiff_t part = pt14[0];
    char*restrict wts = bufs[0];
    char*restrict src = bufs[1];
    char*restrict dst = bufs[2];

    // The generated column loop is guarded by "!= 49" and always leaves
    // after a single pass, so it collapses to this one check.
    if (col == 49) return;

    const char* in = src+16384*col;
    char* out = dst+256*col;
    const __m512 zero = _mm512_setzero_ps();

    ptrdiff_t grp = 11*part;
    const ptrdiff_t lastGrp = grp+10;
    for (; grp != 21; ++grp) {
        const char* w = wts+1560*grp;
        char* tile = out+75648*grp;
        __m512 acc[6][4];

        // Seed all four column vectors of each row with that row's first float.
        for (int c = 0; c < 6; ++c) {
            const __m512 seed = _mm512_set1_ps(*(const float*)(w+4*c));
            for (int q = 0; q < 4; ++q) {
                acc[c][q] = seed;
            }
        }

        // 64 accumulation steps: 4 data vectors x 6 broadcast weights.
        for (ptrdiff_t s = 0; s < 64; ++s) {
            const char* wrow = w+24*(s+1);
            __m512 x[4];
            for (int q = 0; q < 4; ++q) {
                x[q] = _mm512_loadu_ps(in+256*s+64*q);
            }
            for (int c = 0; c < 6; ++c) {
                const __m512 coef = _mm512_set1_ps(*(const float*)(wrow+4*c));
                for (int q = 0; q < 4; ++q) {
                    acc[c][q] = _mm512_fmadd_ps(coef, x[q], acc[c][q]);
                }
            }
        }

        // Clamp at zero and write the 6 x 4 tile, row by row.
        for (int c = 0; c < 6; ++c) {
            for (int q = 0; q < 4; ++q) {
                _mm512_mask_storeu_ps(tile+12608*c+64*q, 65535, _mm512_max_ps(zero, acc[c][q]));
            }
        }

        if (grp >= lastGrp) return;
    }

    // Reached only when the walk above stopped at group 21: the narrow
    // trailing group with 2 rows and an 8-byte weight row pitch.
    {
        const char* w = wts+1560*grp;
        char* tile = out+75648*grp;
        __m512 acc[2][4];

        for (int c = 0; c < 2; ++c) {
            const __m512 seed = _mm512_set1_ps(*(const float*)(w+4*c));
            for (int q = 0; q < 4; ++q) {
                acc[c][q] = seed;
            }
        }

        for (ptrdiff_t s = 0; s < 64; ++s) {
            const char* wrow = w+8*(s+1);
            __m512 x[4];
            for (int q = 0; q < 4; ++q) {
                x[q] = _mm512_loadu_ps(in+256*s+64*q);
            }
            for (int c = 0; c < 2; ++c) {
                const __m512 coef = _mm512_set1_ps(*(const float*)(wrow+4*c));
                for (int q = 0; q < 4; ++q) {
                    acc[c][q] = _mm512_fmadd_ps(coef, x[q], acc[c][q]);
                }
            }
        }

        for (int c = 0; c < 2; ++c) {
            for (int q = 0; q < 4; ++q) {
                _mm512_mask_storeu_ps(tile+12608*c+64*q, 65535, _mm512_max_ps(zero, acc[c][q]));
            }
        }
    }
}

// Runs DenseNet121OneApply1Callee1 through the threader team.
//
// team21    - threader team handed straight to DenseNet121ThreaderDo1.
// tensors15 - buffer table; placed in slot 0 of a two-entry pointer array
//             (slot 1 is null) that becomes the task's opaque any1 argument.
//
// The task is described as a 3-dimensional hull of extents 2 x 49 x 1; the
// callee reads its first two point coordinates as pt14[0] and pt14[1].
// NOTE(review): the pointer array lives on this function's stack, so this
// relies on DenseNet121ThreaderDo1 not using the task after it returns -
// confirm against the threader implementation.
static void DenseNet121OneApply1(DenseNet121ThreaderTeam1* team21, char** tensors15) {
    void* ctx[2];
    ctx[0] = tensors15;
    ctx[1] = 0;

    const int extents[3] = {2, 49, 1};
    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    job.any1 = ctx;
    job.callee1 = DenseNet121OneApply1Callee1;
    DenseNet121ThreaderDo1(team21, &job);
}

static void DenseNet121OneArrangeWts2Callee1(DenseNet121ThreaderTask1* task28, int64_t* pt19) {
char** tensors26 = task28->any1;
(void)pt19;
char*restrict wtPtr4 = tensors26[0]+(ptrdiff_t)3340*0+(ptrdiff_t)49152*0;
char*restrict biasPtr4 = tensors26[1]+(ptrdiff_t)512*0;
char*restrict bnPtr4 = tensors26[2]+(ptrdiff_t)8*128*0;
char*restrict arranged3 = tensors26[3]+(ptrdiff_t)428032*0+(ptrdiff_t)49664*0;
ptrdiff_t ii4 = 1;
for (ptrdiff_t i20 = 0; i20 < ii4; ++i20) {
ptrdiff_t j14 = 0;
ptrdiff_t jj23 = j14+8;
for (; j14 < jj23; ++j14) {
if (j14 < 7) {
ptrdiff_t k71 = 0+16*(j14-0);
ptrdiff_t l23 = (size_t)(0+k71)/6;
ptrdiff_t cut5 = (size_t)(0+k71)%6;
switch (cut5) {
case 0:;
case 2: {
__m512 sum78 = _mm512_maskz_loadu_ps(65535, biasPtr4+512*i20+4*k71);
__m512i pmMul5 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd5 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo4 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k71+128*i20));
__m512 masHi4 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k71+128*i20)+(ptrdiff_t)64);
__m512 postMul8 = _mm512_permutex2var_ps(masLo4, pmMul5, masHi4);
__m512 postAdd6 = _mm512_permutex2var_ps(masLo4, pmAdd5, masHi4);
sum78 = _mm512_fmadd_ps(sum78, postMul8, postAdd6);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum78);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)2304, 4032>>cut5, sum78);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)4608, 65535-(4095>>cut5), sum78);
ptrdiff_t c11 = 0;
for (; c11 != 6; ++c11) {
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)0);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)384);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)768);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)1152);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)1536);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)1920);
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)2304);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)2688);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)3072);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)3456);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)3840);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)4224);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)4608);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)4992);
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)5376);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c11+(ptrdiff_t)5760);
__m512 tmp5157 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp5158 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp5159 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp5160 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp5161 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp5162 = _mm512_unpackhi_ps(wt95, wt96);
__m512 tmp5163 = _mm512_unpacklo_ps(wt97, wt98);
__m512 tmp5164 = _mm512_unpackhi_ps(wt97, wt98);
__m512 tmp5165 = _mm512_unpacklo_ps(wt99, wt100);
__m512 tmp5166 = _mm512_unpackhi_ps(wt99, wt100);
__m512 tmp5167 = _mm512_unpacklo_ps(wt101, wt102);
__m512 tmp5168 = _mm512_unpackhi_ps(wt101, wt102);
__m512 tmp5169 = _mm512_unpacklo_ps(wt103, wt104);
__m512 tmp5170 = _mm512_unpackhi_ps(wt103, wt104);
__m512 tmp5171 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp5172 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp5173 = _mm512_shuffle_ps(tmp5157, tmp5159, 68);
__m512 tmp5174 = _mm512_shuffle_ps(tmp5157, tmp5159, 238);
__m512 tmp5175 = _mm512_shuffle_ps(tmp5158, tmp5160, 68);
__m512 tmp5176 = _mm512_shuffle_ps(tmp5158, tmp5160, 238);
__m512 tmp5177 = _mm512_shuffle_ps(tmp5161, tmp5163, 68);
__m512 tmp5178 = _mm512_shuffle_ps(tmp5161, tmp5163, 238);
__m512 tmp5179 = _mm512_shuffle_ps(tmp5162, tmp5164, 68);
__m512 tmp5180 = _mm512_shuffle_ps(tmp5162, tmp5164, 238);
__m512 tmp5181 = _mm512_shuffle_ps(tmp5165, tmp5167, 68);
__m512 tmp5182 = _mm512_shuffle_ps(tmp5165, tmp5167, 238);
__m512 tmp5183 = _mm512_shuffle_ps(tmp5166, tmp5168, 68);
__m512 tmp5184 = _mm512_shuffle_ps(tmp5166, tmp5168, 238);
__m512 tmp5185 = _mm512_shuffle_ps(tmp5169, tmp5171, 68);
__m512 tmp5186 = _mm512_shuffle_ps(tmp5169, tmp5171, 238);
__m512 tmp5187 = _mm512_shuffle_ps(tmp5170, tmp5172, 68);
__m512 tmp5188 = _mm512_shuffle_ps(tmp5170, tmp5172, 238);
__m512 tmp5189 = _mm512_shuffle_f32x4(tmp5173, tmp5177, 136);
__m512 tmp5190 = _mm512_shuffle_f32x4(tmp5173, tmp5177, 221);
__m512 tmp5191 = _mm512_shuffle_f32x4(tmp5174, tmp5178, 136);
__m512 tmp5192 = _mm512_shuffle_f32x4(tmp5174, tmp5178, 221);
__m512 tmp5193 = _mm512_shuffle_f32x4(tmp5175, tmp5179, 136);
__m512 tmp5194 = _mm512_shuffle_f32x4(tmp5175, tmp5179, 221);
__m512 tmp5195 = _mm512_shuffle_f32x4(tmp5176, tmp5180, 136);
__m512 tmp5196 = _mm512_shuffle_f32x4(tmp5176, tmp5180, 221);
__m512 tmp5197 = _mm512_shuffle_f32x4(tmp5181, tmp5185, 136);
__m512 tmp5198 = _mm512_shuffle_f32x4(tmp5181, tmp5185, 221);
__m512 tmp5199 = _mm512_shuffle_f32x4(tmp5182, tmp5186, 136);
__m512 tmp5200 = _mm512_shuffle_f32x4(tmp5182, tmp5186, 221);
__m512 tmp5201 = _mm512_shuffle_f32x4(tmp5183, tmp5187, 136);
__m512 tmp5202 = _mm512_shuffle_f32x4(tmp5183, tmp5187, 221);
__m512 tmp5203 = _mm512_shuffle_f32x4(tmp5184, tmp5188, 136);
__m512 tmp5204 = _mm512_shuffle_f32x4(tmp5184, tmp5188, 221);
wt91 = _mm512_shuffle_f32x4(tmp5189, tmp5197, 136);
wt99 = _mm512_shuffle_f32x4(tmp5189, tmp5197, 221);
wt92 = _mm512_shuffle_f32x4(tmp5191, tmp5199, 136);
wt100 = _mm512_shuffle_f32x4(tmp5191, tmp5199, 221);
wt93 = _mm512_shuffle_f32x4(tmp5193, tmp5201, 136);
wt101 = _mm512_shuffle_f32x4(tmp5193, tmp5201, 221);
wt94 = _mm512_shuffle_f32x4(tmp5195, tmp5203, 136);
wt102 = _mm512_shuffle_f32x4(tmp5195, tmp5203, 221);
wt95 = _mm512_shuffle_f32x4(tmp5190, tmp5198, 136);
wt103 = _mm512_shuffle_f32x4(tmp5190, tmp5198, 221);
wt96 = _mm512_shuffle_f32x4(tmp5192, tmp5200, 136);
wt104 = _mm512_shuffle_f32x4(tmp5192, tmp5200, 221);
wt97 = _mm512_shuffle_f32x4(tmp5194, tmp5202, 136);
wt105 = _mm512_shuffle_f32x4(tmp5194, tmp5202, 221);
wt98 = _mm512_shuffle_f32x4(tmp5196, tmp5204, 136);
wt106 = _mm512_shuffle_f32x4(tmp5196, tmp5204, 221);
wt91 = _mm512_mul_ps(wt91, postMul8);
wt92 = _mm512_mul_ps(wt92, postMul8);
wt93 = _mm512_mul_ps(wt93, postMul8);
wt94 = _mm512_mul_ps(wt94, postMul8);
wt95 = _mm512_mul_ps(wt95, postMul8);
wt96 = _mm512_mul_ps(wt96, postMul8);
wt97 = _mm512_mul_ps(wt97, postMul8);
wt98 = _mm512_mul_ps(wt98, postMul8);
wt99 = _mm512_mul_ps(wt99, postMul8);
wt100 = _mm512_mul_ps(wt100, postMul8);
wt101 = _mm512_mul_ps(wt101, postMul8);
wt102 = _mm512_mul_ps(wt102, postMul8);
wt103 = _mm512_mul_ps(wt103, postMul8);
wt104 = _mm512_mul_ps(wt104, postMul8);
wt105 = _mm512_mul_ps(wt105, postMul8);
wt106 = _mm512_mul_ps(wt106, postMul8);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c11)+(ptrdiff_t)0, 63>>cut5, wt91);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c11)+(ptrdiff_t)0, 63>>cut5, wt92);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c11)+(ptrdiff_t)0, 63>>cut5, wt93);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c11)+(ptrdiff_t)0, 63>>cut5, wt94);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c11)+(ptrdiff_t)0, 63>>cut5, wt95);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c11)+(ptrdiff_t)0, 63>>cut5, wt96);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c11)+(ptrdiff_t)0, 63>>cut5, wt97);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c11)+(ptrdiff_t)0, 63>>cut5, wt98);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c11)+(ptrdiff_t)0, 63>>cut5, wt99);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c11)+(ptrdiff_t)0, 63>>cut5, wt100);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c11)+(ptrdiff_t)0, 63>>cut5, wt101);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c11)+(ptrdiff_t)0, 63>>cut5, wt102);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c11)+(ptrdiff_t)0, 63>>cut5, wt103);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c11)+(ptrdiff_t)0, 63>>cut5, wt104);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c11)+(ptrdiff_t)0, 63>>cut5, wt105);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c11)+(ptrdiff_t)0, 63>>cut5, wt106);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt91);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt92);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt93);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt94);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt95);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt96);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt97);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt98);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt99);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt100);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt101);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt102);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt103);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt104);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt105);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c11)+(ptrdiff_t)2304, 4032>>cut5, wt106);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt91);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt92);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt93);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt94);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt95);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt96);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt97);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt98);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt99);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt100);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt101);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt102);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt103);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt104);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt105);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c11)+(ptrdiff_t)4608, 65535-(4095>>cut5), wt106);
}
break;
}
default: {
cut5 = 4;
__m512 sum79 = _mm512_maskz_loadu_ps(65535, biasPtr4+512*i20+4*k71);
__m512i pmMul6 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd6 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo5 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k71+128*i20));
__m512 masHi5 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k71+128*i20)+(ptrdiff_t)64);
__m512 postMul9 = _mm512_permutex2var_ps(masLo5, pmMul6, masHi5);
__m512 postAdd7 = _mm512_permutex2var_ps(masLo5, pmAdd6, masHi5);
sum79 = _mm512_fmadd_ps(sum79, postMul9, postAdd7);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)2304, 4032>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)4608, 258048>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*0+(ptrdiff_t)6912, 65535-(262143>>cut5), sum79);
ptrdiff_t c12 = 0;
for (; c12 != 6; ++c12) {
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)0);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)384);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)768);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)1152);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)1536);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)1920);
__m512 wt113 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)2304);
__m512 wt114 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)2688);
__m512 wt115 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)3072);
__m512 wt116 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)3456);
__m512 wt117 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)3840);
__m512 wt118 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)4224);
__m512 wt119 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)4608);
__m512 wt120 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)4992);
__m512 wt121 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)5376);
__m512 wt122 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k71+64*c12+(ptrdiff_t)5760);
__m512 tmp5205 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp5206 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp5207 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp5208 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp5209 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp5210 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp5211 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp5212 = _mm512_unpackhi_ps(wt113, wt114);
__m512 tmp5213 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp5214 = _mm512_unpackhi_ps(wt115, wt116);
__m512 tmp5215 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp5216 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp5217 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp5218 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp5219 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp5220 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp5221 = _mm512_shuffle_ps(tmp5205, tmp5207, 68);
__m512 tmp5222 = _mm512_shuffle_ps(tmp5205, tmp5207, 238);
__m512 tmp5223 = _mm512_shuffle_ps(tmp5206, tmp5208, 68);
__m512 tmp5224 = _mm512_shuffle_ps(tmp5206, tmp5208, 238);
__m512 tmp5225 = _mm512_shuffle_ps(tmp5209, tmp5211, 68);
__m512 tmp5226 = _mm512_shuffle_ps(tmp5209, tmp5211, 238);
__m512 tmp5227 = _mm512_shuffle_ps(tmp5210, tmp5212, 68);
__m512 tmp5228 = _mm512_shuffle_ps(tmp5210, tmp5212, 238);
__m512 tmp5229 = _mm512_shuffle_ps(tmp5213, tmp5215, 68);
__m512 tmp5230 = _mm512_shuffle_ps(tmp5213, tmp5215, 238);
__m512 tmp5231 = _mm512_shuffle_ps(tmp5214, tmp5216, 68);
__m512 tmp5232 = _mm512_shuffle_ps(tmp5214, tmp5216, 238);
__m512 tmp5233 = _mm512_shuffle_ps(tmp5217, tmp5219, 68);
__m512 tmp5234 = _mm512_shuffle_ps(tmp5217, tmp5219, 238);
__m512 tmp5235 = _mm512_shuffle_ps(tmp5218, tmp5220, 68);
__m512 tmp5236 = _mm512_shuffle_ps(tmp5218, tmp5220, 238);
__m512 tmp5237 = _mm512_shuffle_f32x4(tmp5221, tmp5225, 136);
__m512 tmp5238 = _mm512_shuffle_f32x4(tmp5221, tmp5225, 221);
__m512 tmp5239 = _mm512_shuffle_f32x4(tmp5222, tmp5226, 136);
__m512 tmp5240 = _mm512_shuffle_f32x4(tmp5222, tmp5226, 221);
__m512 tmp5241 = _mm512_shuffle_f32x4(tmp5223, tmp5227, 136);
__m512 tmp5242 = _mm512_shuffle_f32x4(tmp5223, tmp5227, 221);
__m512 tmp5243 = _mm512_shuffle_f32x4(tmp5224, tmp5228, 136);
__m512 tmp5244 = _mm512_shuffle_f32x4(tmp5224, tmp5228, 221);
__m512 tmp5245 = _mm512_shuffle_f32x4(tmp5229, tmp5233, 136);
__m512 tmp5246 = _mm512_shuffle_f32x4(tmp5229, tmp5233, 221);
__m512 tmp5247 = _mm512_shuffle_f32x4(tmp5230, tmp5234, 136);
__m512 tmp5248 = _mm512_shuffle_f32x4(tmp5230, tmp5234, 221);
__m512 tmp5249 = _mm512_shuffle_f32x4(tmp5231, tmp5235, 136);
__m512 tmp5250 = _mm512_shuffle_f32x4(tmp5231, tmp5235, 221);
__m512 tmp5251 = _mm512_shuffle_f32x4(tmp5232, tmp5236, 136);
__m512 tmp5252 = _mm512_shuffle_f32x4(tmp5232, tmp5236, 221);
wt107 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 136);
wt115 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 221);
wt108 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 136);
wt116 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 221);
wt109 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 136);
wt117 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 221);
wt110 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 136);
wt118 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 221);
wt111 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 136);
wt119 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 221);
wt112 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 136);
wt120 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 221);
wt113 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 136);
wt121 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 221);
wt114 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 136);
wt122 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 221);
wt107 = _mm512_mul_ps(wt107, postMul9);
wt108 = _mm512_mul_ps(wt108, postMul9);
wt109 = _mm512_mul_ps(wt109, postMul9);
wt110 = _mm512_mul_ps(wt110, postMul9);
wt111 = _mm512_mul_ps(wt111, postMul9);
wt112 = _mm512_mul_ps(wt112, postMul9);
wt113 = _mm512_mul_ps(wt113, postMul9);
wt114 = _mm512_mul_ps(wt114, postMul9);
wt115 = _mm512_mul_ps(wt115, postMul9);
wt116 = _mm512_mul_ps(wt116, postMul9);
wt117 = _mm512_mul_ps(wt117, postMul9);
wt118 = _mm512_mul_ps(wt118, postMul9);
wt119 = _mm512_mul_ps(wt119, postMul9);
wt120 = _mm512_mul_ps(wt120, postMul9);
wt121 = _mm512_mul_ps(wt121, postMul9);
wt122 = _mm512_mul_ps(wt122, postMul9);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)0, 63>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)0, 63>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)0, 63>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)0, 63>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)0, 63>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)0, 63>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)0, 63>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)0, 63>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)0, 63>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)0, 63>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)0, 63>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)0, 63>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)0, 63>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)0, 63>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)0, 63>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)0, 63>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)2304, 4032>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)4608, 258048>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt107);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt108);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt109);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt110);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt111);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt112);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt113);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt114);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt115);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt116);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt117);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt118);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt119);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt120);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt121);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)6912, 65535-(262143>>cut5), wt122);
}
}
}
} else {
ptrdiff_t k70 = 112;
ptrdiff_t l22 = (size_t)(0+k70)/6;
ptrdiff_t cut4 = (size_t)(0+k70)%6;
__m512 sum77 = _mm512_maskz_loadu_ps(65535, biasPtr4+512*i20+4*k70);
__m512i pmMul7 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd7 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo6 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k70+128*i20));
__m512 masHi6 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k70+128*i20)+(ptrdiff_t)64);
__m512 postMul7 = _mm512_permutex2var_ps(masLo6, pmMul7, masHi6);
__m512 postAdd5 = _mm512_permutex2var_ps(masLo6, pmAdd7, masHi6);
sum77 = _mm512_fmadd_ps(sum77, postMul7, postAdd5);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*0+(ptrdiff_t)2304, 4032>>cut4, sum77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*0+(ptrdiff_t)4608, 258048>>cut4, sum77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*0+(ptrdiff_t)6912, 65535-(262143>>cut4), sum77);
ptrdiff_t c10 = 0;
for (; c10 != 6; ++c10) {
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)0);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)384);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)768);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)1152);
__m512 wt79 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)1536);
__m512 wt80 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)1920);
__m512 wt81 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)2304);
__m512 wt82 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)2688);
__m512 wt83 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)3072);
__m512 wt84 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)3456);
__m512 wt85 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)3840);
__m512 wt86 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)4224);
__m512 wt87 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)4608);
__m512 wt88 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)4992);
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)5376);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr4+49152*i20+384*k70+64*c10+(ptrdiff_t)5760);
__m512 tmp5253 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp5254 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp5255 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp5256 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp5257 = _mm512_unpacklo_ps(wt79, wt80);
__m512 tmp5258 = _mm512_unpackhi_ps(wt79, wt80);
__m512 tmp5259 = _mm512_unpacklo_ps(wt81, wt82);
__m512 tmp5260 = _mm512_unpackhi_ps(wt81, wt82);
__m512 tmp5261 = _mm512_unpacklo_ps(wt83, wt84);
__m512 tmp5262 = _mm512_unpackhi_ps(wt83, wt84);
__m512 tmp5263 = _mm512_unpacklo_ps(wt85, wt86);
__m512 tmp5264 = _mm512_unpackhi_ps(wt85, wt86);
__m512 tmp5265 = _mm512_unpacklo_ps(wt87, wt88);
__m512 tmp5266 = _mm512_unpackhi_ps(wt87, wt88);
__m512 tmp5267 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp5268 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp5269 = _mm512_shuffle_ps(tmp5253, tmp5255, 68);
__m512 tmp5270 = _mm512_shuffle_ps(tmp5253, tmp5255, 238);
__m512 tmp5271 = _mm512_shuffle_ps(tmp5254, tmp5256, 68);
__m512 tmp5272 = _mm512_shuffle_ps(tmp5254, tmp5256, 238);
__m512 tmp5273 = _mm512_shuffle_ps(tmp5257, tmp5259, 68);
__m512 tmp5274 = _mm512_shuffle_ps(tmp5257, tmp5259, 238);
__m512 tmp5275 = _mm512_shuffle_ps(tmp5258, tmp5260, 68);
__m512 tmp5276 = _mm512_shuffle_ps(tmp5258, tmp5260, 238);
__m512 tmp5277 = _mm512_shuffle_ps(tmp5261, tmp5263, 68);
__m512 tmp5278 = _mm512_shuffle_ps(tmp5261, tmp5263, 238);
__m512 tmp5279 = _mm512_shuffle_ps(tmp5262, tmp5264, 68);
__m512 tmp5280 = _mm512_shuffle_ps(tmp5262, tmp5264, 238);
__m512 tmp5281 = _mm512_shuffle_ps(tmp5265, tmp5267, 68);
__m512 tmp5282 = _mm512_shuffle_ps(tmp5265, tmp5267, 238);
__m512 tmp5283 = _mm512_shuffle_ps(tmp5266, tmp5268, 68);
__m512 tmp5284 = _mm512_shuffle_ps(tmp5266, tmp5268, 238);
__m512 tmp5285 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 136);
__m512 tmp5286 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 221);
__m512 tmp5287 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 136);
__m512 tmp5288 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 221);
__m512 tmp5289 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 136);
__m512 tmp5290 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 221);
__m512 tmp5291 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 136);
__m512 tmp5292 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 221);
__m512 tmp5293 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 136);
__m512 tmp5294 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 221);
__m512 tmp5295 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 136);
__m512 tmp5296 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 221);
__m512 tmp5297 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 136);
__m512 tmp5298 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 221);
__m512 tmp5299 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 136);
__m512 tmp5300 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 221);
wt75 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 136);
wt83 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 221);
wt76 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 136);
wt84 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 221);
wt77 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 136);
wt85 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 221);
wt78 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 136);
wt86 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 221);
wt79 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 136);
wt87 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 221);
wt80 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 136);
wt88 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 221);
wt81 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 136);
wt89 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 221);
wt82 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 136);
wt90 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 221);
wt75 = _mm512_mul_ps(wt75, postMul7);
wt76 = _mm512_mul_ps(wt76, postMul7);
wt77 = _mm512_mul_ps(wt77, postMul7);
wt78 = _mm512_mul_ps(wt78, postMul7);
wt79 = _mm512_mul_ps(wt79, postMul7);
wt80 = _mm512_mul_ps(wt80, postMul7);
wt81 = _mm512_mul_ps(wt81, postMul7);
wt82 = _mm512_mul_ps(wt82, postMul7);
wt83 = _mm512_mul_ps(wt83, postMul7);
wt84 = _mm512_mul_ps(wt84, postMul7);
wt85 = _mm512_mul_ps(wt85, postMul7);
wt86 = _mm512_mul_ps(wt86, postMul7);
wt87 = _mm512_mul_ps(wt87, postMul7);
wt88 = _mm512_mul_ps(wt88, postMul7);
wt89 = _mm512_mul_ps(wt89, postMul7);
wt90 = _mm512_mul_ps(wt90, postMul7);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(1+16*c10)+(ptrdiff_t)0, 63>>cut4, wt75);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(2+16*c10)+(ptrdiff_t)0, 63>>cut4, wt76);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(3+16*c10)+(ptrdiff_t)0, 63>>cut4, wt77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(4+16*c10)+(ptrdiff_t)0, 63>>cut4, wt78);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(5+16*c10)+(ptrdiff_t)0, 63>>cut4, wt79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(6+16*c10)+(ptrdiff_t)0, 63>>cut4, wt80);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(7+16*c10)+(ptrdiff_t)0, 63>>cut4, wt81);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(8+16*c10)+(ptrdiff_t)0, 63>>cut4, wt82);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(9+16*c10)+(ptrdiff_t)0, 63>>cut4, wt83);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(10+16*c10)+(ptrdiff_t)0, 63>>cut4, wt84);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(11+16*c10)+(ptrdiff_t)0, 63>>cut4, wt85);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(12+16*c10)+(ptrdiff_t)0, 63>>cut4, wt86);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(13+16*c10)+(ptrdiff_t)0, 63>>cut4, wt87);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(14+16*c10)+(ptrdiff_t)0, 63>>cut4, wt88);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(15+16*c10)+(ptrdiff_t)0, 63>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(16+16*c10)+(ptrdiff_t)0, 63>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(1+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt75);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(2+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt76);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(3+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(4+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt78);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(5+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(6+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt80);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(7+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt81);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(8+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt82);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(9+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt83);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(10+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt84);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(11+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt85);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(12+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt86);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(13+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt87);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(14+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt88);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(15+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(16+16*c10)+(ptrdiff_t)2304, 4032>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(1+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt75);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(2+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt76);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(3+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(4+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt78);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(5+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(6+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt80);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(7+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt81);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(8+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt82);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(9+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt83);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(10+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt84);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(11+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt85);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(12+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt86);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(13+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt87);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(14+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt88);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(15+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+24*(16+16*c10)+(ptrdiff_t)4608, 258048>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(1+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt75);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(2+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt76);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(3+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt77);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(4+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt78);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(5+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt79);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(6+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt80);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(7+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt81);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(8+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt82);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(9+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt83);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(10+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt84);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(11+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt85);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(12+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt86);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(13+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt87);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(14+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt88);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(15+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt89);
_mm512_mask_storeu_ps(arranged3+49664*i20+2328*l22+4*cut4+8*(16+16*c10)+(ptrdiff_t)6912, 65535-(262143>>cut4), wt90);
}
}
}
}
}

// Builds a task descriptor for DenseNet121OneArrangeWts2Callee1 and hands it to
// DenseNet121ThreaderDo1 on the given team.
//   team26    - threader team passed straight through to DenseNet121ThreaderDo1.
//   tensors25 - tensor pointer table; stored in the task's any1 field for the callee.
// The task is declared with 3 dimensions, each of extent 1.
static void DenseNet121OneArrangeWts2(DenseNet121ThreaderTeam1* team26, char** tensors25) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = 1;
    }
    job.any1 = tensors25;
    job.callee1 = DenseNet121OneArrangeWts2Callee1;
    DenseNet121ThreaderDo1(team26, &job);
}

// Task callee that rescales, clamps and re-lays-out input data.
//   task30 - task descriptor; its any1 field is a table of three buffers:
//            [0] source data, [1] per-channel multiply/add float pairs,
//            [2] destination ("arranged") buffer.
//   pt20   - task coordinates; only pt20[1] is read.
// For each 256-byte tile (64 floats) and each of 96 channels, computes
// max(0, x*mul + add) and writes the result so that the 96 channels of one
// tile are contiguous (24576 bytes per tile, 256 bytes per channel).
// NOTE(review): the multiply/add pairs are presumably folded batch-norm
// parameters (the generator names them bn*) - confirm against the caller.
static void DenseNet121OneArrangeDats2Callee1(DenseNet121ThreaderTask1* task30, int64_t* pt20) {
    char** bufs = task30->any1;
    const ptrdiff_t coord = pt20[1];
    char*restrict src = bufs[0];
    char*restrict scaleShift = bufs[1];
    char*restrict dst = bufs[2];
    const __mmask16 allLanes = 65535;
    const __m512 zero = _mm512_setzero_ps();

    // Each coordinate owns tiles 2*coord and 2*coord+1; coordinates >= 23 get
    // one extra tile. The walk also stops if the tile index reaches 49.
    ptrdiff_t tile = 2*coord;
    const ptrdiff_t lastTile = tile+(coord < 23 ? 1 : 2);
    while (tile != 49) {
        for (ptrdiff_t ch = 0; ch < 96; ++ch) {
            // Source: 12608 bytes per channel, 256 bytes per tile.
            char* in = src+256*tile+12608*ch;
            // Destination: 24576 bytes per tile, 256 bytes per channel.
            char* out = dst+24576*tile+256*ch;
            // Two consecutive floats per channel: multiplier, then addend.
            const float* pair = (float*)scaleShift+(ptrdiff_t)2*ch;

            __m512 v0 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)0);
            __m512 v1 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)64);
            __m512 v2 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)128);
            __m512 v3 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)192);
            const __m512 mul = _mm512_set1_ps(pair[0]);
            const __m512 add = _mm512_set1_ps(pair[1]);

            // Operand order of max (zero first) is kept as in the generator output.
            v0 = _mm512_max_ps(zero, _mm512_fmadd_ps(v0, mul, add));
            v1 = _mm512_max_ps(zero, _mm512_fmadd_ps(v1, mul, add));
            v2 = _mm512_max_ps(zero, _mm512_fmadd_ps(v2, mul, add));
            v3 = _mm512_max_ps(zero, _mm512_fmadd_ps(v3, mul, add));

            _mm512_mask_storeu_ps(out+(ptrdiff_t)0, allLanes, v0);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)64, allLanes, v1);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)128, allLanes, v2);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)192, allLanes, v3);
        }
        if (tile >= lastTile) break;
        ++tile;
    }
}

// Builds a task descriptor for DenseNet121OneArrangeDats2Callee1 and hands it
// to DenseNet121ThreaderDo1 on the given team.
//   team27    - threader team passed straight through to DenseNet121ThreaderDo1.
//   tensors27 - tensor pointer table; stored in the task's any1 field for the callee.
// The task is declared with 4 dimensions of extents 1, 24, 1, 1. The callee
// reads only coordinate index 1, and its `< 23` split matches an extent of 24.
static void DenseNet121OneArrangeDats2(DenseNet121ThreaderTeam1* team27, char** tensors27) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 1;
    job.hull1[1] = 24;
    job.hull1[0] = 1;
    job.any1 = tensors27;
    job.callee1 = DenseNet121OneArrangeDats2Callee1;
    DenseNet121ThreaderDo1(team27, &job);
}

// Task callee for the "apply" step: multiplies arranged weights by arranged
// data with FMAs, clamps the results at zero, and stores them to the output.
//   task32 - task descriptor; any1 is a void* pair whose element 0 is the
//            tensor table: [0] arranged weights, [1] arranged data, [2] output.
//   pt21   - task coordinates: pt21[0] selects a range of output-row groups
//            (w34), pt21[1] selects the data tile (d6).
// One call handles a single data tile (256 bytes = 4 vectors of 16 floats per
// channel) and up to 7 groups of 6 output rows; when the group index reaches
// 21 it additionally handles a final group of only 2 output rows.
static void DenseNet121OneApply2Callee1(DenseNet121ThreaderTask1* task32, int64_t* pt21) {
void** pair6 = task32->any1;
char** tensors30 = pair6[0];
// e10 and g11 are fixed at 0 here, so the three base pointers below carry no extra offset.
ptrdiff_t e10 = 0;
ptrdiff_t g11 = 0;
ptrdiff_t d6 = pt21[1];
ptrdiff_t w34 = pt21[0];
char*restrict arrangedWts2 = tensors30[0]+428032*e10+(ptrdiff_t)49664*1*g11;
char*restrict arrangedDats2 = tensors30[1]+10474240*e10+(ptrdiff_t)1204224*1*g11;
char*restrict datPtr8 = tensors30[2]+(ptrdiff_t)1613824*1*g11;
// Single-trip loop (ii6 == 1); i22 is always 0.
ptrdiff_t ii6 = 1;
for (ptrdiff_t i22 = 0; i22 < ii6; ++i22) {
// jj25 equals the starting j16, so the check at the bottom of the j16 loop
// returns after the first tile: each call processes exactly one tile.
ptrdiff_t j16 = 1*d6;
ptrdiff_t jj25 = j16+0;
for (; j16 != 49; ++j16) {
// Groups of 6 output rows: start at 7*w34 and stop after index kk28
// (start+6 for w34 < 2). For w34 == 2, kk28 is 21, which is never reached
// inside the loop, so the loop exits at k73 == 21 and falls into the
// 2-row tail below.
ptrdiff_t k73 = 7*w34;
ptrdiff_t kk28 = k73+(w34 < 2 ? 6 : 7);
for (; k73 != 21; ++k73) {
// With s14 == -1 the term 24*s14+24 is 0, so the six initial values are the
// first six floats of this group's 2328-byte weight block (97 slots of 24
// bytes: slot 0 here, slots 1..96 in the loop below).
// NOTE(review): slot 0 is presumably the bias term written by the weight
// arranger - confirm against the arranging code.
ptrdiff_t s14 = -1;
__m512 sum80 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)24));
__m512 sum84 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)28));
__m512 sum88 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)32));
__m512 sum92 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)36));
__m512 sum96 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)40));
__m512 sum100 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)44));
// Each output row gets four accumulators, one per 64-byte vector of the tile.
__m512 sum81 = sum80;
__m512 sum82 = sum80;
__m512 sum83 = sum80;
__m512 sum85 = sum84;
__m512 sum86 = sum84;
__m512 sum87 = sum84;
__m512 sum89 = sum88;
__m512 sum90 = sum88;
__m512 sum91 = sum88;
__m512 sum93 = sum92;
__m512 sum94 = sum92;
__m512 sum95 = sum92;
__m512 sum97 = sum96;
__m512 sum98 = sum96;
__m512 sum99 = sum96;
__m512 sum101 = sum100;
__m512 sum102 = sum100;
__m512 sum103 = sum100;
// Reduction over 96 input channels: load the tile's 4 data vectors once,
// then for each of the 6 rows broadcast one weight and do 4 FMAs.
for (s14 = 0; s14 < 96; ++s14) {
__m512 dat1275 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s14+(ptrdiff_t)0);
__m512 dat1276 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s14+(ptrdiff_t)64);
__m512 dat1277 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s14+(ptrdiff_t)128);
__m512 dat1278 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s14+(ptrdiff_t)192);
__m512 wt123 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)24));
sum80 = _mm512_fmadd_ps(wt123, dat1275, sum80);
sum81 = _mm512_fmadd_ps(wt123, dat1276, sum81);
sum82 = _mm512_fmadd_ps(wt123, dat1277, sum82);
sum83 = _mm512_fmadd_ps(wt123, dat1278, sum83);
__m512 wt124 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)28));
sum84 = _mm512_fmadd_ps(wt124, dat1275, sum84);
sum85 = _mm512_fmadd_ps(wt124, dat1276, sum85);
sum86 = _mm512_fmadd_ps(wt124, dat1277, sum86);
sum87 = _mm512_fmadd_ps(wt124, dat1278, sum87);
__m512 wt125 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)32));
sum88 = _mm512_fmadd_ps(wt125, dat1275, sum88);
sum89 = _mm512_fmadd_ps(wt125, dat1276, sum89);
sum90 = _mm512_fmadd_ps(wt125, dat1277, sum90);
sum91 = _mm512_fmadd_ps(wt125, dat1278, sum91);
__m512 wt126 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)36));
sum92 = _mm512_fmadd_ps(wt126, dat1275, sum92);
sum93 = _mm512_fmadd_ps(wt126, dat1276, sum93);
sum94 = _mm512_fmadd_ps(wt126, dat1277, sum94);
sum95 = _mm512_fmadd_ps(wt126, dat1278, sum95);
__m512 wt127 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)40));
sum96 = _mm512_fmadd_ps(wt127, dat1275, sum96);
sum97 = _mm512_fmadd_ps(wt127, dat1276, sum97);
sum98 = _mm512_fmadd_ps(wt127, dat1277, sum98);
sum99 = _mm512_fmadd_ps(wt127, dat1278, sum99);
__m512 wt128 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+24*s14+(ptrdiff_t)44));
sum100 = _mm512_fmadd_ps(wt128, dat1275, sum100);
sum101 = _mm512_fmadd_ps(wt128, dat1276, sum101);
sum102 = _mm512_fmadd_ps(wt128, dat1277, sum102);
sum103 = _mm512_fmadd_ps(wt128, dat1278, sum103);
}
// Clamp negatives to zero (ReLU) and store. Output rows are 12608 bytes
// apart; a group of 6 rows spans 75648 bytes; the tile sits at 256*j16.
// Row 0 of the group.
sum80 = _mm512_max_ps(_mm512_setzero_ps(), sum80);
sum81 = _mm512_max_ps(_mm512_setzero_ps(), sum81);
sum82 = _mm512_max_ps(_mm512_setzero_ps(), sum82);
sum83 = _mm512_max_ps(_mm512_setzero_ps(), sum83);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)0, 65535, sum80);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)64, 65535, sum81);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)128, 65535, sum82);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)192, 65535, sum83);
// Row 1 of the group.
sum84 = _mm512_max_ps(_mm512_setzero_ps(), sum84);
sum85 = _mm512_max_ps(_mm512_setzero_ps(), sum85);
sum86 = _mm512_max_ps(_mm512_setzero_ps(), sum86);
sum87 = _mm512_max_ps(_mm512_setzero_ps(), sum87);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12608, 65535, sum84);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12672, 65535, sum85);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12736, 65535, sum86);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12800, 65535, sum87);
// Row 2 of the group.
sum88 = _mm512_max_ps(_mm512_setzero_ps(), sum88);
sum89 = _mm512_max_ps(_mm512_setzero_ps(), sum89);
sum90 = _mm512_max_ps(_mm512_setzero_ps(), sum90);
sum91 = _mm512_max_ps(_mm512_setzero_ps(), sum91);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)25216, 65535, sum88);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)25280, 65535, sum89);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)25344, 65535, sum90);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)25408, 65535, sum91);
// Row 3 of the group.
sum92 = _mm512_max_ps(_mm512_setzero_ps(), sum92);
sum93 = _mm512_max_ps(_mm512_setzero_ps(), sum93);
sum94 = _mm512_max_ps(_mm512_setzero_ps(), sum94);
sum95 = _mm512_max_ps(_mm512_setzero_ps(), sum95);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)37824, 65535, sum92);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)37888, 65535, sum93);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)37952, 65535, sum94);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)38016, 65535, sum95);
// Row 4 of the group.
sum96 = _mm512_max_ps(_mm512_setzero_ps(), sum96);
sum97 = _mm512_max_ps(_mm512_setzero_ps(), sum97);
sum98 = _mm512_max_ps(_mm512_setzero_ps(), sum98);
sum99 = _mm512_max_ps(_mm512_setzero_ps(), sum99);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)50432, 65535, sum96);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)50496, 65535, sum97);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)50560, 65535, sum98);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)50624, 65535, sum99);
// Row 5 of the group.
sum100 = _mm512_max_ps(_mm512_setzero_ps(), sum100);
sum101 = _mm512_max_ps(_mm512_setzero_ps(), sum101);
sum102 = _mm512_max_ps(_mm512_setzero_ps(), sum102);
sum103 = _mm512_max_ps(_mm512_setzero_ps(), sum103);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)63040, 65535, sum100);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)63104, 65535, sum101);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)63168, 65535, sum102);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)63232, 65535, sum103);
// Last group of this task's range done: leave the whole function.
if (k73 >= kk28) return;
}
// Tail group, reached only when the loop above exits with k73 == 21.
// It has 2 output rows instead of 6, so weight slots are 8 bytes wide
// (8*s15+8 is 0 for s15 == -1, giving the two initial values).
ptrdiff_t s15 = -1;
__m512 sum104 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+8*s15+(ptrdiff_t)8));
__m512 sum108 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+8*s15+(ptrdiff_t)12));
__m512 sum105 = sum104;
__m512 sum106 = sum104;
__m512 sum107 = sum104;
__m512 sum109 = sum108;
__m512 sum110 = sum108;
__m512 sum111 = sum108;
// Same 96-channel reduction as above, with 2 rows x 4 vectors.
for (s15 = 0; s15 < 96; ++s15) {
__m512 dat1279 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s15+(ptrdiff_t)0);
__m512 dat1280 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s15+(ptrdiff_t)64);
__m512 dat1281 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s15+(ptrdiff_t)128);
__m512 dat1282 = _mm512_loadu_ps(arrangedDats2+1204224*i22+24576*j16+256*s15+(ptrdiff_t)192);
__m512 wt129 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+8*s15+(ptrdiff_t)8));
sum104 = _mm512_fmadd_ps(wt129, dat1279, sum104);
sum105 = _mm512_fmadd_ps(wt129, dat1280, sum105);
sum106 = _mm512_fmadd_ps(wt129, dat1281, sum106);
sum107 = _mm512_fmadd_ps(wt129, dat1282, sum107);
__m512 wt130 = _mm512_set1_ps(*(float*)(arrangedWts2+49664*i22+2328*k73+8*s15+(ptrdiff_t)12));
sum108 = _mm512_fmadd_ps(wt130, dat1279, sum108);
sum109 = _mm512_fmadd_ps(wt130, dat1280, sum109);
sum110 = _mm512_fmadd_ps(wt130, dat1281, sum110);
sum111 = _mm512_fmadd_ps(wt130, dat1282, sum111);
}
// Clamp at zero and store the two tail rows.
sum104 = _mm512_max_ps(_mm512_setzero_ps(), sum104);
sum105 = _mm512_max_ps(_mm512_setzero_ps(), sum105);
sum106 = _mm512_max_ps(_mm512_setzero_ps(), sum106);
sum107 = _mm512_max_ps(_mm512_setzero_ps(), sum107);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)0, 65535, sum104);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)64, 65535, sum105);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)128, 65535, sum106);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)192, 65535, sum107);
sum108 = _mm512_max_ps(_mm512_setzero_ps(), sum108);
sum109 = _mm512_max_ps(_mm512_setzero_ps(), sum109);
sum110 = _mm512_max_ps(_mm512_setzero_ps(), sum110);
sum111 = _mm512_max_ps(_mm512_setzero_ps(), sum111);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12608, 65535, sum108);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12672, 65535, sum109);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12736, 65535, sum110);
_mm512_mask_storeu_ps(datPtr8+1613824*i22+256*j16+75648*k73+(ptrdiff_t)12800, 65535, sum111);
// jj25 is the starting tile, so this always returns after the first tile.
if (j16 >= jj25) return;
}
}
}

// Builds a task descriptor for DenseNet121OneApply2Callee1 and hands it to
// DenseNet121ThreaderDo1 on the given team.
//   team28    - threader team passed straight through to DenseNet121ThreaderDo1.
//   tensors29 - tensor pointer table; wrapped as element 0 of a two-slot
//               pointer pair (slot 1 is null) that the callee unpacks.
// The task is declared with 3 dimensions of extents 3, 49, 1. The callee reads
// coordinate 0 as its row-group range selector and coordinate 1 as its tile.
static void DenseNet121OneApply2(DenseNet121ThreaderTeam1* team28, char** tensors29) {
    void* ctx[2];
    ctx[0] = tensors29;
    ctx[1] = 0;

    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 49;
    job.hull1[0] = 3;
    job.any1 = ctx;
    job.callee1 = DenseNet121OneApply2Callee1;
    DenseNet121ThreaderDo1(team28, &job);
}

static void DenseNet121OneArrangeWts3Callee1(DenseNet121ThreaderTask1* task34, int64_t* pt22) {
char** tensors32 = task34->any1;
ptrdiff_t b47 = pt22[0];
char*restrict wtPtr5 = tensors32[0]+(ptrdiff_t)3340*0+(ptrdiff_t)65536*0;
char*restrict biasPtr5 = tensors32[1]+(ptrdiff_t)512*0;
char*restrict bnPtr6 = tensors32[2]+(ptrdiff_t)8*128*0;
char*restrict arranged5 = tensors32[3]+(ptrdiff_t)428032*0+(ptrdiff_t)66048*0;
ptrdiff_t ii7 = 1;
for (ptrdiff_t i23 = 0; i23 < ii7; ++i23) {
ptrdiff_t j17 = 4*b47;
ptrdiff_t jj26 = j17+4;
for (; j17 < jj26; ++j17) {
if (j17 < 7) {
ptrdiff_t k75 = 0+16*(j17-0);
ptrdiff_t l25 = (size_t)(0+k75)/6;
ptrdiff_t cut7 = (size_t)(0+k75)%6;
switch (cut7) {
case 0:;
case 2: {
__m512 sum113 = _mm512_maskz_loadu_ps(65535, biasPtr5+512*i23+4*k75);
__m512i pmMul8 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd8 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo7 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k75+128*i23));
__m512 masHi7 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k75+128*i23)+(ptrdiff_t)64);
__m512 postMul11 = _mm512_permutex2var_ps(masLo7, pmMul8, masHi7);
__m512 postAdd9 = _mm512_permutex2var_ps(masLo7, pmAdd8, masHi7);
sum113 = _mm512_fmadd_ps(sum113, postMul11, postAdd9);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum113);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)3072, 4032>>cut7, sum113);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)6144, 65535-(4095>>cut7), sum113);
ptrdiff_t c15 = 0;
for (; c15 != 8; ++c15) {
__m512 wt147 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)0);
__m512 wt148 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)512);
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)1024);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)1536);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)2048);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)2560);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)3072);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)3584);
__m512 wt155 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)4096);
__m512 wt156 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)4608);
__m512 wt157 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)5120);
__m512 wt158 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)5632);
__m512 wt159 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)6144);
__m512 wt160 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)6656);
__m512 wt161 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)7168);
__m512 wt162 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c15+(ptrdiff_t)7680);
__m512 tmp5301 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp5302 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp5303 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp5304 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp5305 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp5306 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp5307 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp5308 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp5309 = _mm512_unpacklo_ps(wt155, wt156);
__m512 tmp5310 = _mm512_unpackhi_ps(wt155, wt156);
__m512 tmp5311 = _mm512_unpacklo_ps(wt157, wt158);
__m512 tmp5312 = _mm512_unpackhi_ps(wt157, wt158);
__m512 tmp5313 = _mm512_unpacklo_ps(wt159, wt160);
__m512 tmp5314 = _mm512_unpackhi_ps(wt159, wt160);
__m512 tmp5315 = _mm512_unpacklo_ps(wt161, wt162);
__m512 tmp5316 = _mm512_unpackhi_ps(wt161, wt162);
__m512 tmp5317 = _mm512_shuffle_ps(tmp5301, tmp5303, 68);
__m512 tmp5318 = _mm512_shuffle_ps(tmp5301, tmp5303, 238);
__m512 tmp5319 = _mm512_shuffle_ps(tmp5302, tmp5304, 68);
__m512 tmp5320 = _mm512_shuffle_ps(tmp5302, tmp5304, 238);
__m512 tmp5321 = _mm512_shuffle_ps(tmp5305, tmp5307, 68);
__m512 tmp5322 = _mm512_shuffle_ps(tmp5305, tmp5307, 238);
__m512 tmp5323 = _mm512_shuffle_ps(tmp5306, tmp5308, 68);
__m512 tmp5324 = _mm512_shuffle_ps(tmp5306, tmp5308, 238);
__m512 tmp5325 = _mm512_shuffle_ps(tmp5309, tmp5311, 68);
__m512 tmp5326 = _mm512_shuffle_ps(tmp5309, tmp5311, 238);
__m512 tmp5327 = _mm512_shuffle_ps(tmp5310, tmp5312, 68);
__m512 tmp5328 = _mm512_shuffle_ps(tmp5310, tmp5312, 238);
__m512 tmp5329 = _mm512_shuffle_ps(tmp5313, tmp5315, 68);
__m512 tmp5330 = _mm512_shuffle_ps(tmp5313, tmp5315, 238);
__m512 tmp5331 = _mm512_shuffle_ps(tmp5314, tmp5316, 68);
__m512 tmp5332 = _mm512_shuffle_ps(tmp5314, tmp5316, 238);
__m512 tmp5333 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 136);
__m512 tmp5334 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 221);
__m512 tmp5335 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 136);
__m512 tmp5336 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 221);
__m512 tmp5337 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 136);
__m512 tmp5338 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 221);
__m512 tmp5339 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 136);
__m512 tmp5340 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 221);
__m512 tmp5341 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 136);
__m512 tmp5342 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 221);
__m512 tmp5343 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 136);
__m512 tmp5344 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 221);
__m512 tmp5345 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 136);
__m512 tmp5346 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 221);
__m512 tmp5347 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 136);
__m512 tmp5348 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 221);
wt147 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 136);
wt155 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 221);
wt148 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 136);
wt156 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 221);
wt149 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 136);
wt157 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 221);
wt150 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 136);
wt158 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 221);
wt151 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 136);
wt159 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 221);
wt152 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 136);
wt160 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 221);
wt153 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 136);
wt161 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 221);
wt154 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 136);
wt162 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 221);
wt147 = _mm512_mul_ps(wt147, postMul11);
wt148 = _mm512_mul_ps(wt148, postMul11);
wt149 = _mm512_mul_ps(wt149, postMul11);
wt150 = _mm512_mul_ps(wt150, postMul11);
wt151 = _mm512_mul_ps(wt151, postMul11);
wt152 = _mm512_mul_ps(wt152, postMul11);
wt153 = _mm512_mul_ps(wt153, postMul11);
wt154 = _mm512_mul_ps(wt154, postMul11);
wt155 = _mm512_mul_ps(wt155, postMul11);
wt156 = _mm512_mul_ps(wt156, postMul11);
wt157 = _mm512_mul_ps(wt157, postMul11);
wt158 = _mm512_mul_ps(wt158, postMul11);
wt159 = _mm512_mul_ps(wt159, postMul11);
wt160 = _mm512_mul_ps(wt160, postMul11);
wt161 = _mm512_mul_ps(wt161, postMul11);
wt162 = _mm512_mul_ps(wt162, postMul11);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c15)+(ptrdiff_t)0, 63>>cut7, wt147);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c15)+(ptrdiff_t)0, 63>>cut7, wt148);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c15)+(ptrdiff_t)0, 63>>cut7, wt149);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c15)+(ptrdiff_t)0, 63>>cut7, wt150);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c15)+(ptrdiff_t)0, 63>>cut7, wt151);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c15)+(ptrdiff_t)0, 63>>cut7, wt152);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c15)+(ptrdiff_t)0, 63>>cut7, wt153);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c15)+(ptrdiff_t)0, 63>>cut7, wt154);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c15)+(ptrdiff_t)0, 63>>cut7, wt155);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c15)+(ptrdiff_t)0, 63>>cut7, wt156);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c15)+(ptrdiff_t)0, 63>>cut7, wt157);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c15)+(ptrdiff_t)0, 63>>cut7, wt158);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c15)+(ptrdiff_t)0, 63>>cut7, wt159);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c15)+(ptrdiff_t)0, 63>>cut7, wt160);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c15)+(ptrdiff_t)0, 63>>cut7, wt161);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c15)+(ptrdiff_t)0, 63>>cut7, wt162);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt147);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt148);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt149);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt150);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt151);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt152);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt153);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt154);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt155);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt156);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt157);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt158);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt159);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt160);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt161);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c15)+(ptrdiff_t)3072, 4032>>cut7, wt162);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt147);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt148);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt149);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt150);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt151);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt152);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt153);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt154);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt155);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt156);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt157);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt158);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt159);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt160);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt161);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c15)+(ptrdiff_t)6144, 65535-(4095>>cut7), wt162);
}
break;
}
default: {
cut7 = 4;
__m512 sum114 = _mm512_maskz_loadu_ps(65535, biasPtr5+512*i23+4*k75);
__m512i pmMul9 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd9 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo8 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k75+128*i23));
__m512 masHi8 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k75+128*i23)+(ptrdiff_t)64);
__m512 postMul12 = _mm512_permutex2var_ps(masLo8, pmMul9, masHi8);
__m512 postAdd10 = _mm512_permutex2var_ps(masLo8, pmAdd9, masHi8);
sum114 = _mm512_fmadd_ps(sum114, postMul12, postAdd10);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)3072, 4032>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)6144, 258048>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*0+(ptrdiff_t)9216, 65535-(262143>>cut7), sum114);
ptrdiff_t c16 = 0;
for (; c16 != 8; ++c16) {
__m512 wt163 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)0);
__m512 wt164 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)512);
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)1024);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)1536);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)2048);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)2560);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)3072);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)3584);
__m512 wt171 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)4096);
__m512 wt172 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)4608);
__m512 wt173 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)5120);
__m512 wt174 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)5632);
__m512 wt175 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)6144);
__m512 wt176 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)6656);
__m512 wt177 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)7168);
__m512 wt178 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k75+64*c16+(ptrdiff_t)7680);
__m512 tmp5349 = _mm512_unpacklo_ps(wt163, wt164);
__m512 tmp5350 = _mm512_unpackhi_ps(wt163, wt164);
__m512 tmp5351 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp5352 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp5353 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp5354 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp5355 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp5356 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp5357 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp5358 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp5359 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp5360 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp5361 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp5362 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp5363 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp5364 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp5365 = _mm512_shuffle_ps(tmp5349, tmp5351, 68);
__m512 tmp5366 = _mm512_shuffle_ps(tmp5349, tmp5351, 238);
__m512 tmp5367 = _mm512_shuffle_ps(tmp5350, tmp5352, 68);
__m512 tmp5368 = _mm512_shuffle_ps(tmp5350, tmp5352, 238);
__m512 tmp5369 = _mm512_shuffle_ps(tmp5353, tmp5355, 68);
__m512 tmp5370 = _mm512_shuffle_ps(tmp5353, tmp5355, 238);
__m512 tmp5371 = _mm512_shuffle_ps(tmp5354, tmp5356, 68);
__m512 tmp5372 = _mm512_shuffle_ps(tmp5354, tmp5356, 238);
__m512 tmp5373 = _mm512_shuffle_ps(tmp5357, tmp5359, 68);
__m512 tmp5374 = _mm512_shuffle_ps(tmp5357, tmp5359, 238);
__m512 tmp5375 = _mm512_shuffle_ps(tmp5358, tmp5360, 68);
__m512 tmp5376 = _mm512_shuffle_ps(tmp5358, tmp5360, 238);
__m512 tmp5377 = _mm512_shuffle_ps(tmp5361, tmp5363, 68);
__m512 tmp5378 = _mm512_shuffle_ps(tmp5361, tmp5363, 238);
__m512 tmp5379 = _mm512_shuffle_ps(tmp5362, tmp5364, 68);
__m512 tmp5380 = _mm512_shuffle_ps(tmp5362, tmp5364, 238);
__m512 tmp5381 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 136);
__m512 tmp5382 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 221);
__m512 tmp5383 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 136);
__m512 tmp5384 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 221);
__m512 tmp5385 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 136);
__m512 tmp5386 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 221);
__m512 tmp5387 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 136);
__m512 tmp5388 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 221);
__m512 tmp5389 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 136);
__m512 tmp5390 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 221);
__m512 tmp5391 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 136);
__m512 tmp5392 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 221);
__m512 tmp5393 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 136);
__m512 tmp5394 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 221);
__m512 tmp5395 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 136);
__m512 tmp5396 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 221);
wt163 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 136);
wt171 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 221);
wt164 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 136);
wt172 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 221);
wt165 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 136);
wt173 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 221);
wt166 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 136);
wt174 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 221);
wt167 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 136);
wt175 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 221);
wt168 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 136);
wt176 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 221);
wt169 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 136);
wt177 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 221);
wt170 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 136);
wt178 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 221);
wt163 = _mm512_mul_ps(wt163, postMul12);
wt164 = _mm512_mul_ps(wt164, postMul12);
wt165 = _mm512_mul_ps(wt165, postMul12);
wt166 = _mm512_mul_ps(wt166, postMul12);
wt167 = _mm512_mul_ps(wt167, postMul12);
wt168 = _mm512_mul_ps(wt168, postMul12);
wt169 = _mm512_mul_ps(wt169, postMul12);
wt170 = _mm512_mul_ps(wt170, postMul12);
wt171 = _mm512_mul_ps(wt171, postMul12);
wt172 = _mm512_mul_ps(wt172, postMul12);
wt173 = _mm512_mul_ps(wt173, postMul12);
wt174 = _mm512_mul_ps(wt174, postMul12);
wt175 = _mm512_mul_ps(wt175, postMul12);
wt176 = _mm512_mul_ps(wt176, postMul12);
wt177 = _mm512_mul_ps(wt177, postMul12);
wt178 = _mm512_mul_ps(wt178, postMul12);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)0, 63>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)0, 63>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)0, 63>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)0, 63>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)0, 63>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)0, 63>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)0, 63>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)0, 63>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)0, 63>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)0, 63>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)0, 63>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)0, 63>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)0, 63>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)0, 63>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)0, 63>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)0, 63>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)3072, 4032>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)6144, 258048>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt163);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt164);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt165);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt166);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt167);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt168);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt169);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt170);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt171);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt172);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt173);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt174);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt175);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt176);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt177);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)9216, 65535-(262143>>cut7), wt178);
}
}
}
} else {
ptrdiff_t k74 = 112;
ptrdiff_t l24 = (size_t)(0+k74)/6;
ptrdiff_t cut6 = (size_t)(0+k74)%6;
__m512 sum112 = _mm512_maskz_loadu_ps(65535, biasPtr5+512*i23+4*k74);
__m512i pmMul10 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd10 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo9 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k74+128*i23));
__m512 masHi9 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k74+128*i23)+(ptrdiff_t)64);
__m512 postMul10 = _mm512_permutex2var_ps(masLo9, pmMul10, masHi9);
__m512 postAdd8 = _mm512_permutex2var_ps(masLo9, pmAdd10, masHi9);
sum112 = _mm512_fmadd_ps(sum112, postMul10, postAdd8);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum112);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*0+(ptrdiff_t)3072, 4032>>cut6, sum112);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*0+(ptrdiff_t)6144, 258048>>cut6, sum112);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*0+(ptrdiff_t)9216, 65535-(262143>>cut6), sum112);
ptrdiff_t c14 = 0;
for (; c14 != 8; ++c14) {
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)0);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)512);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)1024);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)1536);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)2048);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)2560);
__m512 wt137 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)3072);
__m512 wt138 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)3584);
__m512 wt139 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)4096);
__m512 wt140 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)4608);
__m512 wt141 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)5120);
__m512 wt142 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)5632);
__m512 wt143 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)6144);
__m512 wt144 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)6656);
__m512 wt145 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)7168);
__m512 wt146 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i23+512*k74+64*c14+(ptrdiff_t)7680);
__m512 tmp5397 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp5398 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp5399 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp5400 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp5401 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp5402 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp5403 = _mm512_unpacklo_ps(wt137, wt138);
__m512 tmp5404 = _mm512_unpackhi_ps(wt137, wt138);
__m512 tmp5405 = _mm512_unpacklo_ps(wt139, wt140);
__m512 tmp5406 = _mm512_unpackhi_ps(wt139, wt140);
__m512 tmp5407 = _mm512_unpacklo_ps(wt141, wt142);
__m512 tmp5408 = _mm512_unpackhi_ps(wt141, wt142);
__m512 tmp5409 = _mm512_unpacklo_ps(wt143, wt144);
__m512 tmp5410 = _mm512_unpackhi_ps(wt143, wt144);
__m512 tmp5411 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp5412 = _mm512_unpackhi_ps(wt145, wt146);
__m512 tmp5413 = _mm512_shuffle_ps(tmp5397, tmp5399, 68);
__m512 tmp5414 = _mm512_shuffle_ps(tmp5397, tmp5399, 238);
__m512 tmp5415 = _mm512_shuffle_ps(tmp5398, tmp5400, 68);
__m512 tmp5416 = _mm512_shuffle_ps(tmp5398, tmp5400, 238);
__m512 tmp5417 = _mm512_shuffle_ps(tmp5401, tmp5403, 68);
__m512 tmp5418 = _mm512_shuffle_ps(tmp5401, tmp5403, 238);
__m512 tmp5419 = _mm512_shuffle_ps(tmp5402, tmp5404, 68);
__m512 tmp5420 = _mm512_shuffle_ps(tmp5402, tmp5404, 238);
__m512 tmp5421 = _mm512_shuffle_ps(tmp5405, tmp5407, 68);
__m512 tmp5422 = _mm512_shuffle_ps(tmp5405, tmp5407, 238);
__m512 tmp5423 = _mm512_shuffle_ps(tmp5406, tmp5408, 68);
__m512 tmp5424 = _mm512_shuffle_ps(tmp5406, tmp5408, 238);
__m512 tmp5425 = _mm512_shuffle_ps(tmp5409, tmp5411, 68);
__m512 tmp5426 = _mm512_shuffle_ps(tmp5409, tmp5411, 238);
__m512 tmp5427 = _mm512_shuffle_ps(tmp5410, tmp5412, 68);
__m512 tmp5428 = _mm512_shuffle_ps(tmp5410, tmp5412, 238);
__m512 tmp5429 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 136);
__m512 tmp5430 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 221);
__m512 tmp5431 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 136);
__m512 tmp5432 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 221);
__m512 tmp5433 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 136);
__m512 tmp5434 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 221);
__m512 tmp5435 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 136);
__m512 tmp5436 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 221);
__m512 tmp5437 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 136);
__m512 tmp5438 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 221);
__m512 tmp5439 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 136);
__m512 tmp5440 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 221);
__m512 tmp5441 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 136);
__m512 tmp5442 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 221);
__m512 tmp5443 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 136);
__m512 tmp5444 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 221);
wt131 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 136);
wt139 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 221);
wt132 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 136);
wt140 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 221);
wt133 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 136);
wt141 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 221);
wt134 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 136);
wt142 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 221);
wt135 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 136);
wt143 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 221);
wt136 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 136);
wt144 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 221);
wt137 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 136);
wt145 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 221);
wt138 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 136);
wt146 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 221);
wt131 = _mm512_mul_ps(wt131, postMul10);
wt132 = _mm512_mul_ps(wt132, postMul10);
wt133 = _mm512_mul_ps(wt133, postMul10);
wt134 = _mm512_mul_ps(wt134, postMul10);
wt135 = _mm512_mul_ps(wt135, postMul10);
wt136 = _mm512_mul_ps(wt136, postMul10);
wt137 = _mm512_mul_ps(wt137, postMul10);
wt138 = _mm512_mul_ps(wt138, postMul10);
wt139 = _mm512_mul_ps(wt139, postMul10);
wt140 = _mm512_mul_ps(wt140, postMul10);
wt141 = _mm512_mul_ps(wt141, postMul10);
wt142 = _mm512_mul_ps(wt142, postMul10);
wt143 = _mm512_mul_ps(wt143, postMul10);
wt144 = _mm512_mul_ps(wt144, postMul10);
wt145 = _mm512_mul_ps(wt145, postMul10);
wt146 = _mm512_mul_ps(wt146, postMul10);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)0, 63>>cut6, wt131);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)0, 63>>cut6, wt132);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)0, 63>>cut6, wt133);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)0, 63>>cut6, wt134);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)0, 63>>cut6, wt135);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)0, 63>>cut6, wt136);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)0, 63>>cut6, wt137);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)0, 63>>cut6, wt138);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)0, 63>>cut6, wt139);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)0, 63>>cut6, wt140);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)0, 63>>cut6, wt141);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)0, 63>>cut6, wt142);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)0, 63>>cut6, wt143);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)0, 63>>cut6, wt144);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)0, 63>>cut6, wt145);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)0, 63>>cut6, wt146);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt131);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt132);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt133);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt134);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt135);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt136);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt137);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt138);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt139);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt140);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt141);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt142);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt143);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt144);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt145);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)3072, 4032>>cut6, wt146);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt131);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt132);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt133);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt134);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt135);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt136);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt137);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt138);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt139);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt140);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt141);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt142);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt143);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt144);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt145);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)6144, 258048>>cut6, wt146);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(1+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt131);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(2+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt132);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(3+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt133);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(4+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt134);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(5+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt135);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(6+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt136);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(7+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt137);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(8+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt138);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(9+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt139);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(10+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt140);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(11+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt141);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(12+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt142);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(13+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt143);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(14+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt144);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(15+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt145);
_mm512_mask_storeu_ps(arranged5+66048*i23+3096*l24+4*cut6+8*(16+16*c14)+(ptrdiff_t)9216, 65535-(262143>>cut6), wt146);
}
}
}
}
}

static void DenseNet121OneArrangeWts3(DenseNet121ThreaderTeam1* team29, char** tensors31) {
	// Hands DenseNet121OneArrangeWts3Callee1 to the threader team with a
	// three-dimensional task description whose hull values are 2, 1, 1
	// (presumably the per-dimension extents of the iteration space).
	// The tensor pointer table is forwarded untouched through any1.
	static const int extents[3] = {2, 1, 1};
	DenseNet121ThreaderTask1 job;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors31;
	job.callee1 = DenseNet121OneArrangeWts3Callee1;
	DenseNet121ThreaderDo1(team29, &job);
}

static void DenseNet121OneArrangeDats3Callee1(DenseNet121ThreaderTask1* task36, int64_t* pt23) {
	// Repacks one 256-byte-wide slab of the source tensor, selected by pt23[1].
	//
	// task36->any1 is a table of three byte pointers:
	//   [0] source data    (row stride 12608 bytes, slab stride 256 bytes)
	//   [1] float pairs    (one {multiplier, addend} pair per row)
	//   [2] destination    (slab stride 32768 bytes, row stride 256 bytes)
	//
	// For each of the 128 rows the four 16-float vectors of the slab are
	// transformed as max(0, x*multiplier + addend) and written contiguously.
	// The pair is named bnMul/bnAdd by the generator, so this is presumably a
	// folded batch-norm followed by ReLU — confirm against the graph.
	char** bufs = task36->any1;
	const ptrdiff_t slab = pt23[1];
	char*restrict src = bufs[0];
	char*restrict coef = bufs[1];
	char*restrict dst = bufs[2];
	// The generated loop ran exactly once for any coordinate other than 49
	// and not at all for 49; this guard reproduces that.
	if (slab == 49) return;
	const __m512 zero = _mm512_setzero_ps();
	for (ptrdiff_t row = 0; row < 128; ++row) {
		const char* in = src+256*slab+12608*row;
		char* out = dst+32768*slab+256*row;
		__m512 v[4];
		// Load all four vectors before reading the coefficients or storing,
		// matching the original statement order.
		for (int lane = 0; lane < 4; ++lane) {
			v[lane] = _mm512_maskz_loadu_ps(65535, in+64*lane);
		}
		const float* pair = (const float*)coef+(ptrdiff_t)2*row;
		const __m512 mul = _mm512_set1_ps(pair[0]);
		const __m512 add = _mm512_set1_ps(pair[1]);
		for (int lane = 0; lane < 4; ++lane) {
			v[lane] = _mm512_fmadd_ps(v[lane], mul, add);
		}
		// Zero stays the first operand of max, as in the original.
		for (int lane = 0; lane < 4; ++lane) {
			v[lane] = _mm512_max_ps(zero, v[lane]);
		}
		for (int lane = 0; lane < 4; ++lane) {
			_mm512_mask_storeu_ps(out+64*lane, 65535, v[lane]);
		}
	}
}

static void DenseNet121OneArrangeDats3(DenseNet121ThreaderTeam1* team30, char** tensors33) {
	// Hands DenseNet121OneArrangeDats3Callee1 to the threader team with a
	// four-dimensional task description whose hull values are 1, 49, 1, 1
	// (presumably per-dimension extents; the callee reads only coordinate 1).
	// The tensor pointer table is forwarded untouched through any1.
	static const int extents[4] = {1, 49, 1, 1};
	DenseNet121ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors33;
	job.callee1 = DenseNet121OneArrangeDats3Callee1;
	DenseNet121ThreaderDo1(team30, &job);
}

static void DenseNet121OneApply3Callee1(DenseNet121ThreaderTask1* task38, int64_t* pt24) {
	// Multiply-accumulate kernel for one (group, slab) work item.
	//
	// task38->any1 points at a pair whose first element is a table of three
	// byte pointers:
	//   [0] arranged weights: 3096-byte blocks; each block starts with one
	//       float per output row, followed by 128 steps of per-row floats
	//   [1] arranged data: 32768 bytes per slab, 128 steps of 4 vectors each
	//   [2] output: 256 bytes per slab, 12608 bytes per output row
	//
	// pt24[1] picks the slab. pt24[0] picks the group of weight blocks:
	// groups below 4 cover blocks 4*g .. 4*g+3 (six output rows each); any
	// other group runs up to block 20 and then the two-row block 21.
	// Each accumulator starts from the block's leading float for its row,
	// adds weight*data over the 128 steps, and is clamped with max(0, x)
	// before being stored.
	// The rows are presumably output channels of the 1x1 convolution —
	// confirm against the graph.
	//
	// All inner loops have fixed trip counts so the compiler can unroll them
	// back into the register-blocked form the generator emitted.
	void** args = task38->any1;
	char** bufs = args[0];
	const ptrdiff_t slab = pt24[1];
	const ptrdiff_t group = pt24[0];
	char*restrict wts = bufs[0];
	char*restrict dats = bufs[1];
	char*restrict out = bufs[2];
	// The generated loop did no work for slab coordinate 49.
	if (slab == 49) return;
	const __m512 zero = _mm512_setzero_ps();
	const char* in = dats+32768*slab;
	char* dst = out+256*slab;
	ptrdiff_t blk = 4*group;
	const ptrdiff_t last = blk+(group < 4 ? 3 : 5);
	for (; blk != 21; ++blk) {
		// Full block: six output rows, weight step of 24 bytes.
		const char* w = wts+3096*blk;
		char* o = dst+75648*blk;
		__m512 acc[6][4];
		for (int r = 0; r < 6; ++r) {
			const __m512 start = _mm512_set1_ps(*(const float*)(w+4*r));
			for (int c = 0; c < 4; ++c) {
				acc[r][c] = start;
			}
		}
		for (ptrdiff_t s = 0; s < 128; ++s) {
			__m512 d[4];
			for (int c = 0; c < 4; ++c) {
				d[c] = _mm512_loadu_ps(in+256*s+64*c);
			}
			for (int r = 0; r < 6; ++r) {
				const __m512 wt = _mm512_set1_ps(*(const float*)(w+24*s+24+4*r));
				for (int c = 0; c < 4; ++c) {
					acc[r][c] = _mm512_fmadd_ps(wt, d[c], acc[r][c]);
				}
			}
		}
		for (int r = 0; r < 6; ++r) {
			for (int c = 0; c < 4; ++c) {
				acc[r][c] = _mm512_max_ps(zero, acc[r][c]);
			}
			for (int c = 0; c < 4; ++c) {
				_mm512_mask_storeu_ps(o+12608*r+64*c, 65535, acc[r][c]);
			}
		}
		// Groups below 4 stop here after their fourth block.
		if (blk >= last) return;
	}
	// Tail block (blk == 21): two output rows, weight step of 8 bytes.
	const char* w = wts+3096*blk;
	char* o = dst+75648*blk;
	__m512 acc[2][4];
	for (int r = 0; r < 2; ++r) {
		const __m512 start = _mm512_set1_ps(*(const float*)(w+4*r));
		for (int c = 0; c < 4; ++c) {
			acc[r][c] = start;
		}
	}
	for (ptrdiff_t s = 0; s < 128; ++s) {
		__m512 d[4];
		for (int c = 0; c < 4; ++c) {
			d[c] = _mm512_loadu_ps(in+256*s+64*c);
		}
		for (int r = 0; r < 2; ++r) {
			const __m512 wt = _mm512_set1_ps(*(const float*)(w+8*s+8+4*r));
			for (int c = 0; c < 4; ++c) {
				acc[r][c] = _mm512_fmadd_ps(wt, d[c], acc[r][c]);
			}
		}
	}
	for (int r = 0; r < 2; ++r) {
		for (int c = 0; c < 4; ++c) {
			acc[r][c] = _mm512_max_ps(zero, acc[r][c]);
		}
		for (int c = 0; c < 4; ++c) {
			_mm512_mask_storeu_ps(o+12608*r+64*c, 65535, acc[r][c]);
		}
	}
}

static void DenseNet121OneApply3(DenseNet121ThreaderTeam1* team31, char** tensors35) {
	// Hands DenseNet121OneApply3Callee1 to the threader team with a
	// three-dimensional task description whose hull values are 5, 49, 1
	// (presumably per-dimension extents; the callee reads coordinates 0 and 1).
	// any1 carries a two-slot pair: the tensor pointer table and a null
	// second slot that the callee does not read.
	static const int extents[3] = {5, 49, 1};
	void* args[2];
	args[0] = tensors35;
	args[1] = 0;
	DenseNet121ThreaderTask1 job;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = args;
	job.callee1 = DenseNet121OneApply3Callee1;
	DenseNet121ThreaderDo1(team31, &job);
}

static void DenseNet121OneArrangeWts4Callee1(DenseNet121ThreaderTask1* task40, int64_t* pt25) {
char** tensors38 = task40->any1;
ptrdiff_t b48 = pt25[0];
char*restrict wtPtr6 = tensors38[0]+(ptrdiff_t)3340*0+(ptrdiff_t)81920*0;
char*restrict biasPtr6 = tensors38[1]+(ptrdiff_t)512*0;
char*restrict bnPtr8 = tensors38[2]+(ptrdiff_t)8*128*0;
char*restrict arranged7 = tensors38[3]+(ptrdiff_t)428032*0+(ptrdiff_t)82432*0;
ptrdiff_t ii10 = 1;
for (ptrdiff_t i27 = 0; i27 < ii10; ++i27) {
ptrdiff_t j20 = 4*b48;
ptrdiff_t jj29 = j20+4;
for (; j20 < jj29; ++j20) {
if (j20 < 7) {
ptrdiff_t k79 = 0+16*(j20-0);
ptrdiff_t l27 = (size_t)(0+k79)/6;
ptrdiff_t cut9 = (size_t)(0+k79)%6;
switch (cut9) {
case 0:;
case 2: {
__m512 sum148 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i27+4*k79);
__m512i pmMul11 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd11 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo10 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k79+128*i27));
__m512 masHi10 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k79+128*i27)+(ptrdiff_t)64);
__m512 postMul14 = _mm512_permutex2var_ps(masLo10, pmMul11, masHi10);
__m512 postAdd12 = _mm512_permutex2var_ps(masLo10, pmAdd11, masHi10);
sum148 = _mm512_fmadd_ps(sum148, postMul14, postAdd12);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)0, 63>>cut9, sum148);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)3840, 4032>>cut9, sum148);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)7680, 65535-(4095>>cut9), sum148);
ptrdiff_t c19 = 0;
for (; c19 != 10; ++c19) {
__m512 wt203 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)0);
__m512 wt204 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)640);
__m512 wt205 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)1280);
__m512 wt206 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)1920);
__m512 wt207 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)2560);
__m512 wt208 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)3200);
__m512 wt209 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)3840);
__m512 wt210 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)4480);
__m512 wt211 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)5120);
__m512 wt212 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)5760);
__m512 wt213 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)6400);
__m512 wt214 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)7040);
__m512 wt215 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)7680);
__m512 wt216 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)8320);
__m512 wt217 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)8960);
__m512 wt218 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c19+(ptrdiff_t)9600);
__m512 tmp5445 = _mm512_unpacklo_ps(wt203, wt204);
__m512 tmp5446 = _mm512_unpackhi_ps(wt203, wt204);
__m512 tmp5447 = _mm512_unpacklo_ps(wt205, wt206);
__m512 tmp5448 = _mm512_unpackhi_ps(wt205, wt206);
__m512 tmp5449 = _mm512_unpacklo_ps(wt207, wt208);
__m512 tmp5450 = _mm512_unpackhi_ps(wt207, wt208);
__m512 tmp5451 = _mm512_unpacklo_ps(wt209, wt210);
__m512 tmp5452 = _mm512_unpackhi_ps(wt209, wt210);
__m512 tmp5453 = _mm512_unpacklo_ps(wt211, wt212);
__m512 tmp5454 = _mm512_unpackhi_ps(wt211, wt212);
__m512 tmp5455 = _mm512_unpacklo_ps(wt213, wt214);
__m512 tmp5456 = _mm512_unpackhi_ps(wt213, wt214);
__m512 tmp5457 = _mm512_unpacklo_ps(wt215, wt216);
__m512 tmp5458 = _mm512_unpackhi_ps(wt215, wt216);
__m512 tmp5459 = _mm512_unpacklo_ps(wt217, wt218);
__m512 tmp5460 = _mm512_unpackhi_ps(wt217, wt218);
__m512 tmp5461 = _mm512_shuffle_ps(tmp5445, tmp5447, 68);
__m512 tmp5462 = _mm512_shuffle_ps(tmp5445, tmp5447, 238);
__m512 tmp5463 = _mm512_shuffle_ps(tmp5446, tmp5448, 68);
__m512 tmp5464 = _mm512_shuffle_ps(tmp5446, tmp5448, 238);
__m512 tmp5465 = _mm512_shuffle_ps(tmp5449, tmp5451, 68);
__m512 tmp5466 = _mm512_shuffle_ps(tmp5449, tmp5451, 238);
__m512 tmp5467 = _mm512_shuffle_ps(tmp5450, tmp5452, 68);
__m512 tmp5468 = _mm512_shuffle_ps(tmp5450, tmp5452, 238);
__m512 tmp5469 = _mm512_shuffle_ps(tmp5453, tmp5455, 68);
__m512 tmp5470 = _mm512_shuffle_ps(tmp5453, tmp5455, 238);
__m512 tmp5471 = _mm512_shuffle_ps(tmp5454, tmp5456, 68);
__m512 tmp5472 = _mm512_shuffle_ps(tmp5454, tmp5456, 238);
__m512 tmp5473 = _mm512_shuffle_ps(tmp5457, tmp5459, 68);
__m512 tmp5474 = _mm512_shuffle_ps(tmp5457, tmp5459, 238);
__m512 tmp5475 = _mm512_shuffle_ps(tmp5458, tmp5460, 68);
__m512 tmp5476 = _mm512_shuffle_ps(tmp5458, tmp5460, 238);
__m512 tmp5477 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 136);
__m512 tmp5478 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 221);
__m512 tmp5479 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 136);
__m512 tmp5480 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 221);
__m512 tmp5481 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 136);
__m512 tmp5482 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 221);
__m512 tmp5483 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 136);
__m512 tmp5484 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 221);
__m512 tmp5485 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 136);
__m512 tmp5486 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 221);
__m512 tmp5487 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 136);
__m512 tmp5488 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 221);
__m512 tmp5489 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 136);
__m512 tmp5490 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 221);
__m512 tmp5491 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 136);
__m512 tmp5492 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 221);
wt203 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 136);
wt211 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 221);
wt204 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 136);
wt212 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 221);
wt205 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 136);
wt213 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 221);
wt206 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 136);
wt214 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 221);
wt207 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 136);
wt215 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 221);
wt208 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 136);
wt216 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 221);
wt209 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 136);
wt217 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 221);
wt210 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 136);
wt218 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 221);
wt203 = _mm512_mul_ps(wt203, postMul14);
wt204 = _mm512_mul_ps(wt204, postMul14);
wt205 = _mm512_mul_ps(wt205, postMul14);
wt206 = _mm512_mul_ps(wt206, postMul14);
wt207 = _mm512_mul_ps(wt207, postMul14);
wt208 = _mm512_mul_ps(wt208, postMul14);
wt209 = _mm512_mul_ps(wt209, postMul14);
wt210 = _mm512_mul_ps(wt210, postMul14);
wt211 = _mm512_mul_ps(wt211, postMul14);
wt212 = _mm512_mul_ps(wt212, postMul14);
wt213 = _mm512_mul_ps(wt213, postMul14);
wt214 = _mm512_mul_ps(wt214, postMul14);
wt215 = _mm512_mul_ps(wt215, postMul14);
wt216 = _mm512_mul_ps(wt216, postMul14);
wt217 = _mm512_mul_ps(wt217, postMul14);
wt218 = _mm512_mul_ps(wt218, postMul14);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c19)+(ptrdiff_t)0, 63>>cut9, wt203);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c19)+(ptrdiff_t)0, 63>>cut9, wt204);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c19)+(ptrdiff_t)0, 63>>cut9, wt205);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c19)+(ptrdiff_t)0, 63>>cut9, wt206);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c19)+(ptrdiff_t)0, 63>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c19)+(ptrdiff_t)0, 63>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c19)+(ptrdiff_t)0, 63>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c19)+(ptrdiff_t)0, 63>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c19)+(ptrdiff_t)0, 63>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c19)+(ptrdiff_t)0, 63>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c19)+(ptrdiff_t)0, 63>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c19)+(ptrdiff_t)0, 63>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c19)+(ptrdiff_t)0, 63>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c19)+(ptrdiff_t)0, 63>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c19)+(ptrdiff_t)0, 63>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c19)+(ptrdiff_t)0, 63>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt203);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt204);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt205);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt206);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c19)+(ptrdiff_t)3840, 4032>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt203);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt204);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt205);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt206);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt207);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt208);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt209);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt210);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt211);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt212);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt213);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt214);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt215);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt216);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt217);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c19)+(ptrdiff_t)7680, 65535-(4095>>cut9), wt218);
}
break;
}
default: {
cut9 = 4;
__m512 sum149 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i27+4*k79);
__m512i pmMul12 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd12 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo11 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k79+128*i27));
__m512 masHi11 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k79+128*i27)+(ptrdiff_t)64);
__m512 postMul15 = _mm512_permutex2var_ps(masLo11, pmMul12, masHi11);
__m512 postAdd13 = _mm512_permutex2var_ps(masLo11, pmAdd12, masHi11);
sum149 = _mm512_fmadd_ps(sum149, postMul15, postAdd13);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)0, 63>>cut9, sum149);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)3840, 4032>>cut9, sum149);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)7680, 258048>>cut9, sum149);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*0+(ptrdiff_t)11520, 65535-(262143>>cut9), sum149);
ptrdiff_t c20 = 0;
for (; c20 != 10; ++c20) {
__m512 wt219 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)0);
__m512 wt220 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)640);
__m512 wt221 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)1280);
__m512 wt222 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)1920);
__m512 wt223 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)2560);
__m512 wt224 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)3200);
__m512 wt225 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)3840);
__m512 wt226 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)4480);
__m512 wt227 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)5120);
__m512 wt228 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)5760);
__m512 wt229 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)6400);
__m512 wt230 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)7040);
__m512 wt231 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)7680);
__m512 wt232 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)8320);
__m512 wt233 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)8960);
__m512 wt234 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k79+64*c20+(ptrdiff_t)9600);
__m512 tmp5493 = _mm512_unpacklo_ps(wt219, wt220);
__m512 tmp5494 = _mm512_unpackhi_ps(wt219, wt220);
__m512 tmp5495 = _mm512_unpacklo_ps(wt221, wt222);
__m512 tmp5496 = _mm512_unpackhi_ps(wt221, wt222);
__m512 tmp5497 = _mm512_unpacklo_ps(wt223, wt224);
__m512 tmp5498 = _mm512_unpackhi_ps(wt223, wt224);
__m512 tmp5499 = _mm512_unpacklo_ps(wt225, wt226);
__m512 tmp5500 = _mm512_unpackhi_ps(wt225, wt226);
__m512 tmp5501 = _mm512_unpacklo_ps(wt227, wt228);
__m512 tmp5502 = _mm512_unpackhi_ps(wt227, wt228);
__m512 tmp5503 = _mm512_unpacklo_ps(wt229, wt230);
__m512 tmp5504 = _mm512_unpackhi_ps(wt229, wt230);
__m512 tmp5505 = _mm512_unpacklo_ps(wt231, wt232);
__m512 tmp5506 = _mm512_unpackhi_ps(wt231, wt232);
__m512 tmp5507 = _mm512_unpacklo_ps(wt233, wt234);
__m512 tmp5508 = _mm512_unpackhi_ps(wt233, wt234);
__m512 tmp5509 = _mm512_shuffle_ps(tmp5493, tmp5495, 68);
__m512 tmp5510 = _mm512_shuffle_ps(tmp5493, tmp5495, 238);
__m512 tmp5511 = _mm512_shuffle_ps(tmp5494, tmp5496, 68);
__m512 tmp5512 = _mm512_shuffle_ps(tmp5494, tmp5496, 238);
__m512 tmp5513 = _mm512_shuffle_ps(tmp5497, tmp5499, 68);
__m512 tmp5514 = _mm512_shuffle_ps(tmp5497, tmp5499, 238);
__m512 tmp5515 = _mm512_shuffle_ps(tmp5498, tmp5500, 68);
__m512 tmp5516 = _mm512_shuffle_ps(tmp5498, tmp5500, 238);
__m512 tmp5517 = _mm512_shuffle_ps(tmp5501, tmp5503, 68);
__m512 tmp5518 = _mm512_shuffle_ps(tmp5501, tmp5503, 238);
__m512 tmp5519 = _mm512_shuffle_ps(tmp5502, tmp5504, 68);
__m512 tmp5520 = _mm512_shuffle_ps(tmp5502, tmp5504, 238);
__m512 tmp5521 = _mm512_shuffle_ps(tmp5505, tmp5507, 68);
__m512 tmp5522 = _mm512_shuffle_ps(tmp5505, tmp5507, 238);
__m512 tmp5523 = _mm512_shuffle_ps(tmp5506, tmp5508, 68);
__m512 tmp5524 = _mm512_shuffle_ps(tmp5506, tmp5508, 238);
__m512 tmp5525 = _mm512_shuffle_f32x4(tmp5509, tmp5513, 136);
__m512 tmp5526 = _mm512_shuffle_f32x4(tmp5509, tmp5513, 221);
__m512 tmp5527 = _mm512_shuffle_f32x4(tmp5510, tmp5514, 136);
__m512 tmp5528 = _mm512_shuffle_f32x4(tmp5510, tmp5514, 221);
__m512 tmp5529 = _mm512_shuffle_f32x4(tmp5511, tmp5515, 136);
__m512 tmp5530 = _mm512_shuffle_f32x4(tmp5511, tmp5515, 221);
__m512 tmp5531 = _mm512_shuffle_f32x4(tmp5512, tmp5516, 136);
__m512 tmp5532 = _mm512_shuffle_f32x4(tmp5512, tmp5516, 221);
__m512 tmp5533 = _mm512_shuffle_f32x4(tmp5517, tmp5521, 136);
__m512 tmp5534 = _mm512_shuffle_f32x4(tmp5517, tmp5521, 221);
__m512 tmp5535 = _mm512_shuffle_f32x4(tmp5518, tmp5522, 136);
__m512 tmp5536 = _mm512_shuffle_f32x4(tmp5518, tmp5522, 221);
__m512 tmp5537 = _mm512_shuffle_f32x4(tmp5519, tmp5523, 136);
__m512 tmp5538 = _mm512_shuffle_f32x4(tmp5519, tmp5523, 221);
__m512 tmp5539 = _mm512_shuffle_f32x4(tmp5520, tmp5524, 136);
__m512 tmp5540 = _mm512_shuffle_f32x4(tmp5520, tmp5524, 221);
wt219 = _mm512_shuffle_f32x4(tmp5525, tmp5533, 136);
wt227 = _mm512_shuffle_f32x4(tmp5525, tmp5533, 221);
wt220 = _mm512_shuffle_f32x4(tmp5527, tmp5535, 136);
wt228 = _mm512_shuffle_f32x4(tmp5527, tmp5535, 221);
wt221 = _mm512_shuffle_f32x4(tmp5529, tmp5537, 136);
wt229 = _mm512_shuffle_f32x4(tmp5529, tmp5537, 221);
wt222 = _mm512_shuffle_f32x4(tmp5531, tmp5539, 136);
wt230 = _mm512_shuffle_f32x4(tmp5531, tmp5539, 221);
wt223 = _mm512_shuffle_f32x4(tmp5526, tmp5534, 136);
wt231 = _mm512_shuffle_f32x4(tmp5526, tmp5534, 221);
wt224 = _mm512_shuffle_f32x4(tmp5528, tmp5536, 136);
wt232 = _mm512_shuffle_f32x4(tmp5528, tmp5536, 221);
wt225 = _mm512_shuffle_f32x4(tmp5530, tmp5538, 136);
wt233 = _mm512_shuffle_f32x4(tmp5530, tmp5538, 221);
wt226 = _mm512_shuffle_f32x4(tmp5532, tmp5540, 136);
wt234 = _mm512_shuffle_f32x4(tmp5532, tmp5540, 221);
wt219 = _mm512_mul_ps(wt219, postMul15);
wt220 = _mm512_mul_ps(wt220, postMul15);
wt221 = _mm512_mul_ps(wt221, postMul15);
wt222 = _mm512_mul_ps(wt222, postMul15);
wt223 = _mm512_mul_ps(wt223, postMul15);
wt224 = _mm512_mul_ps(wt224, postMul15);
wt225 = _mm512_mul_ps(wt225, postMul15);
wt226 = _mm512_mul_ps(wt226, postMul15);
wt227 = _mm512_mul_ps(wt227, postMul15);
wt228 = _mm512_mul_ps(wt228, postMul15);
wt229 = _mm512_mul_ps(wt229, postMul15);
wt230 = _mm512_mul_ps(wt230, postMul15);
wt231 = _mm512_mul_ps(wt231, postMul15);
wt232 = _mm512_mul_ps(wt232, postMul15);
wt233 = _mm512_mul_ps(wt233, postMul15);
wt234 = _mm512_mul_ps(wt234, postMul15);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c20)+(ptrdiff_t)0, 63>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c20)+(ptrdiff_t)0, 63>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c20)+(ptrdiff_t)0, 63>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c20)+(ptrdiff_t)0, 63>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c20)+(ptrdiff_t)0, 63>>cut9, wt223);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c20)+(ptrdiff_t)0, 63>>cut9, wt224);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c20)+(ptrdiff_t)0, 63>>cut9, wt225);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c20)+(ptrdiff_t)0, 63>>cut9, wt226);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c20)+(ptrdiff_t)0, 63>>cut9, wt227);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c20)+(ptrdiff_t)0, 63>>cut9, wt228);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c20)+(ptrdiff_t)0, 63>>cut9, wt229);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c20)+(ptrdiff_t)0, 63>>cut9, wt230);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c20)+(ptrdiff_t)0, 63>>cut9, wt231);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c20)+(ptrdiff_t)0, 63>>cut9, wt232);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c20)+(ptrdiff_t)0, 63>>cut9, wt233);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c20)+(ptrdiff_t)0, 63>>cut9, wt234);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt223);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt224);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt225);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt226);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt227);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt228);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt229);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt230);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt231);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt232);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt233);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c20)+(ptrdiff_t)3840, 4032>>cut9, wt234);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt223);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt224);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt225);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt226);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt227);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt228);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt229);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt230);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt231);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt232);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt233);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c20)+(ptrdiff_t)7680, 258048>>cut9, wt234);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(1+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt219);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(2+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt220);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(3+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt221);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(4+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt222);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(5+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt223);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(6+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt224);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(7+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt225);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(8+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt226);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(9+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt227);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(10+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt228);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(11+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt229);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(12+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt230);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(13+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt231);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(14+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt232);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(15+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt233);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l27+4*cut9+24*(16+16*c20)+(ptrdiff_t)11520, 65535-(262143>>cut9), wt234);
}
}
}
} else {
ptrdiff_t k78 = 112;
ptrdiff_t l26 = (size_t)(0+k78)/6;
ptrdiff_t cut8 = (size_t)(0+k78)%6;
__m512 sum147 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i27+4*k78);
__m512i pmMul13 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd13 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo12 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k78+128*i27));
__m512 masHi12 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k78+128*i27)+(ptrdiff_t)64);
__m512 postMul13 = _mm512_permutex2var_ps(masLo12, pmMul13, masHi12);
__m512 postAdd11 = _mm512_permutex2var_ps(masLo12, pmAdd13, masHi12);
sum147 = _mm512_fmadd_ps(sum147, postMul13, postAdd11);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*0+(ptrdiff_t)0, 63>>cut8, sum147);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*0+(ptrdiff_t)3840, 4032>>cut8, sum147);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*0+(ptrdiff_t)7680, 258048>>cut8, sum147);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*0+(ptrdiff_t)11520, 65535-(262143>>cut8), sum147);
ptrdiff_t c18 = 0;
for (; c18 != 10; ++c18) {
__m512 wt187 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)0);
__m512 wt188 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)640);
__m512 wt189 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)1280);
__m512 wt190 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)1920);
__m512 wt191 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)2560);
__m512 wt192 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)3200);
__m512 wt193 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)3840);
__m512 wt194 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)4480);
__m512 wt195 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)5120);
__m512 wt196 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)5760);
__m512 wt197 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)6400);
__m512 wt198 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)7040);
__m512 wt199 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)7680);
__m512 wt200 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)8320);
__m512 wt201 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)8960);
__m512 wt202 = _mm512_maskz_loadu_ps(65535, wtPtr6+81920*i27+640*k78+64*c18+(ptrdiff_t)9600);
__m512 tmp5541 = _mm512_unpacklo_ps(wt187, wt188);
__m512 tmp5542 = _mm512_unpackhi_ps(wt187, wt188);
__m512 tmp5543 = _mm512_unpacklo_ps(wt189, wt190);
__m512 tmp5544 = _mm512_unpackhi_ps(wt189, wt190);
__m512 tmp5545 = _mm512_unpacklo_ps(wt191, wt192);
__m512 tmp5546 = _mm512_unpackhi_ps(wt191, wt192);
__m512 tmp5547 = _mm512_unpacklo_ps(wt193, wt194);
__m512 tmp5548 = _mm512_unpackhi_ps(wt193, wt194);
__m512 tmp5549 = _mm512_unpacklo_ps(wt195, wt196);
__m512 tmp5550 = _mm512_unpackhi_ps(wt195, wt196);
__m512 tmp5551 = _mm512_unpacklo_ps(wt197, wt198);
__m512 tmp5552 = _mm512_unpackhi_ps(wt197, wt198);
__m512 tmp5553 = _mm512_unpacklo_ps(wt199, wt200);
__m512 tmp5554 = _mm512_unpackhi_ps(wt199, wt200);
__m512 tmp5555 = _mm512_unpacklo_ps(wt201, wt202);
__m512 tmp5556 = _mm512_unpackhi_ps(wt201, wt202);
__m512 tmp5557 = _mm512_shuffle_ps(tmp5541, tmp5543, 68);
__m512 tmp5558 = _mm512_shuffle_ps(tmp5541, tmp5543, 238);
__m512 tmp5559 = _mm512_shuffle_ps(tmp5542, tmp5544, 68);
__m512 tmp5560 = _mm512_shuffle_ps(tmp5542, tmp5544, 238);
__m512 tmp5561 = _mm512_shuffle_ps(tmp5545, tmp5547, 68);
__m512 tmp5562 = _mm512_shuffle_ps(tmp5545, tmp5547, 238);
__m512 tmp5563 = _mm512_shuffle_ps(tmp5546, tmp5548, 68);
__m512 tmp5564 = _mm512_shuffle_ps(tmp5546, tmp5548, 238);
__m512 tmp5565 = _mm512_shuffle_ps(tmp5549, tmp5551, 68);
__m512 tmp5566 = _mm512_shuffle_ps(tmp5549, tmp5551, 238);
__m512 tmp5567 = _mm512_shuffle_ps(tmp5550, tmp5552, 68);
__m512 tmp5568 = _mm512_shuffle_ps(tmp5550, tmp5552, 238);
__m512 tmp5569 = _mm512_shuffle_ps(tmp5553, tmp5555, 68);
__m512 tmp5570 = _mm512_shuffle_ps(tmp5553, tmp5555, 238);
__m512 tmp5571 = _mm512_shuffle_ps(tmp5554, tmp5556, 68);
__m512 tmp5572 = _mm512_shuffle_ps(tmp5554, tmp5556, 238);
__m512 tmp5573 = _mm512_shuffle_f32x4(tmp5557, tmp5561, 136);
__m512 tmp5574 = _mm512_shuffle_f32x4(tmp5557, tmp5561, 221);
__m512 tmp5575 = _mm512_shuffle_f32x4(tmp5558, tmp5562, 136);
__m512 tmp5576 = _mm512_shuffle_f32x4(tmp5558, tmp5562, 221);
__m512 tmp5577 = _mm512_shuffle_f32x4(tmp5559, tmp5563, 136);
__m512 tmp5578 = _mm512_shuffle_f32x4(tmp5559, tmp5563, 221);
__m512 tmp5579 = _mm512_shuffle_f32x4(tmp5560, tmp5564, 136);
__m512 tmp5580 = _mm512_shuffle_f32x4(tmp5560, tmp5564, 221);
__m512 tmp5581 = _mm512_shuffle_f32x4(tmp5565, tmp5569, 136);
__m512 tmp5582 = _mm512_shuffle_f32x4(tmp5565, tmp5569, 221);
__m512 tmp5583 = _mm512_shuffle_f32x4(tmp5566, tmp5570, 136);
__m512 tmp5584 = _mm512_shuffle_f32x4(tmp5566, tmp5570, 221);
__m512 tmp5585 = _mm512_shuffle_f32x4(tmp5567, tmp5571, 136);
__m512 tmp5586 = _mm512_shuffle_f32x4(tmp5567, tmp5571, 221);
__m512 tmp5587 = _mm512_shuffle_f32x4(tmp5568, tmp5572, 136);
__m512 tmp5588 = _mm512_shuffle_f32x4(tmp5568, tmp5572, 221);
wt187 = _mm512_shuffle_f32x4(tmp5573, tmp5581, 136);
wt195 = _mm512_shuffle_f32x4(tmp5573, tmp5581, 221);
wt188 = _mm512_shuffle_f32x4(tmp5575, tmp5583, 136);
wt196 = _mm512_shuffle_f32x4(tmp5575, tmp5583, 221);
wt189 = _mm512_shuffle_f32x4(tmp5577, tmp5585, 136);
wt197 = _mm512_shuffle_f32x4(tmp5577, tmp5585, 221);
wt190 = _mm512_shuffle_f32x4(tmp5579, tmp5587, 136);
wt198 = _mm512_shuffle_f32x4(tmp5579, tmp5587, 221);
wt191 = _mm512_shuffle_f32x4(tmp5574, tmp5582, 136);
wt199 = _mm512_shuffle_f32x4(tmp5574, tmp5582, 221);
wt192 = _mm512_shuffle_f32x4(tmp5576, tmp5584, 136);
wt200 = _mm512_shuffle_f32x4(tmp5576, tmp5584, 221);
wt193 = _mm512_shuffle_f32x4(tmp5578, tmp5586, 136);
wt201 = _mm512_shuffle_f32x4(tmp5578, tmp5586, 221);
wt194 = _mm512_shuffle_f32x4(tmp5580, tmp5588, 136);
wt202 = _mm512_shuffle_f32x4(tmp5580, tmp5588, 221);
wt187 = _mm512_mul_ps(wt187, postMul13);
wt188 = _mm512_mul_ps(wt188, postMul13);
wt189 = _mm512_mul_ps(wt189, postMul13);
wt190 = _mm512_mul_ps(wt190, postMul13);
wt191 = _mm512_mul_ps(wt191, postMul13);
wt192 = _mm512_mul_ps(wt192, postMul13);
wt193 = _mm512_mul_ps(wt193, postMul13);
wt194 = _mm512_mul_ps(wt194, postMul13);
wt195 = _mm512_mul_ps(wt195, postMul13);
wt196 = _mm512_mul_ps(wt196, postMul13);
wt197 = _mm512_mul_ps(wt197, postMul13);
wt198 = _mm512_mul_ps(wt198, postMul13);
wt199 = _mm512_mul_ps(wt199, postMul13);
wt200 = _mm512_mul_ps(wt200, postMul13);
wt201 = _mm512_mul_ps(wt201, postMul13);
wt202 = _mm512_mul_ps(wt202, postMul13);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)0, 63>>cut8, wt187);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)0, 63>>cut8, wt188);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)0, 63>>cut8, wt189);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)0, 63>>cut8, wt190);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)0, 63>>cut8, wt191);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)0, 63>>cut8, wt192);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)0, 63>>cut8, wt193);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)0, 63>>cut8, wt194);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)0, 63>>cut8, wt195);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)0, 63>>cut8, wt196);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)0, 63>>cut8, wt197);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)0, 63>>cut8, wt198);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)0, 63>>cut8, wt199);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)0, 63>>cut8, wt200);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)0, 63>>cut8, wt201);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)0, 63>>cut8, wt202);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt187);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt188);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt189);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt190);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt191);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt192);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt193);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt194);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt195);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt196);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt197);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt198);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt199);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt200);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt201);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)3840, 4032>>cut8, wt202);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt187);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt188);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt189);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt190);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt191);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt192);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt193);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt194);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt195);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt196);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt197);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt198);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt199);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt200);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt201);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)7680, 258048>>cut8, wt202);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(1+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt187);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(2+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt188);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(3+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt189);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(4+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt190);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(5+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt191);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(6+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt192);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(7+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt193);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(8+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt194);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(9+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt195);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(10+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt196);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(11+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt197);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(12+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt198);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(13+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt199);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(14+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt200);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(15+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt201);
_mm512_mask_storeu_ps(arranged7+82432*i27+3864*l26+4*cut8+8*(16+16*c18)+(ptrdiff_t)11520, 65535-(262143>>cut8), wt202);
}
}
}
}
}

// Builds a threader task that runs DenseNet121OneArrangeWts4Callee1 over a
// three-axis grid of extent 2 x 1 x 1 and submits it to the given team.
//   team32    - thread team passed through to DenseNet121ThreaderDo1.
//   tensors37 - pointer table forwarded to the callee via the task's any1 field.
static void DenseNet121OneArrangeWts4(DenseNet121ThreaderTeam1* team32, char** tensors37) {
    DenseNet121ThreaderTask1 job;
    // Grid description: number of axes, then the extent of each axis.
    job.nd1 = 3;
    job.hull1[0] = 2;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    // Work function and the opaque argument it will read back from any1.
    job.any1 = tensors37;
    job.callee1 = DenseNet121OneArrangeWts4Callee1;
    DenseNet121ThreaderDo1(team32, &job);
}

// Worker for one grid point of the data-arrangement task.
//
// task42->any1 is read as a table of three byte pointers:
//   [0] source data, [1] per-channel float pairs, [2] destination buffer.
// pt26[1] selects a 256-byte tile (four 16-float vectors). For each of the
// 160 channels the tile is loaded from the source (channel stride 12608 bytes,
// tile stride 256 bytes), transformed as max(0, x*pair[0] + pair[1]), and
// stored to the destination (tile stride 40960 bytes, channel stride 256 bytes).
// NOTE(review): the pairs are presumably folded batch-norm scale/shift and the
// max-with-zero the following ReLU - confirm against the generator.
static void DenseNet121OneArrangeDats4Callee1(DenseNet121ThreaderTask1* task42, int64_t* pt26) {
    char** bufs = task42->any1;
    const ptrdiff_t tile = pt26[1];
    char*restrict src = bufs[0];
    char*restrict affine = bufs[1];
    char*restrict dst = bufs[2];

    // The generated loop was guarded by `tile != 49` and left after a single
    // pass, so a tile index of exactly 49 performs no work.
    if (tile == 49) return;

    const __m512 zero = _mm512_setzero_ps();
    for (ptrdiff_t ch = 0; ch < 160; ++ch) {
        const char* in = src + 256*tile + 12608*ch;
        char* out = dst + 40960*tile + 256*ch;

        // Two consecutive floats per channel: multiplier first, addend second.
        const float* pair = (const float*)affine + (ptrdiff_t)2*ch;
        const __m512 scale = _mm512_set1_ps(pair[0]);
        const __m512 shift = _mm512_set1_ps(pair[1]);

        // Load all four vectors before storing any, matching the original order.
        __m512 v[4];
        for (int q = 0; q < 4; ++q) {
            v[q] = _mm512_maskz_loadu_ps(65535, in + 64*q);
        }
        for (int q = 0; q < 4; ++q) {
            // Zero stays the first operand of max, as in the original.
            v[q] = _mm512_max_ps(zero, _mm512_fmadd_ps(v[q], scale, shift));
        }
        for (int q = 0; q < 4; ++q) {
            _mm512_mask_storeu_ps(out + 64*q, 65535, v[q]);
        }
    }
}

// Launches the input-arrangement pass on the thread team.
//
// Builds a task whose callee is DenseNet121OneArrangeDats4Callee1, whose
// payload is the caller's tensor pointer array, and whose iteration hull is
// the 4-D box {1, 49, 1, 1}, then passes it to DenseNet121ThreaderDo1.
static void DenseNet121OneArrangeDats4(DenseNet121ThreaderTeam1* team33, char** tensors39) {
	static const int extents[4] = {1, 49, 1, 1};
	DenseNet121ThreaderTask1 job;
	job.callee1 = DenseNet121OneArrangeDats4Callee1;
	job.any1 = tensors39;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	DenseNet121ThreaderDo1(team33, &job);
}

// Computes one register tile of the 1x1 convolution: `rows` output channels
// (6 or 2 at the call sites) by 64 output floats, reduced over 160 inputs.
//
// wts  - packed coefficients for this channel group: `rows` bias floats,
//        then for each of the 160 inputs another `rows` weight floats.
// dats - arranged input tile, 256 bytes (4 vectors) per input channel.
// out  - destination of row 0; consecutive rows are 12608 bytes apart.
//
// Each accumulator starts from its row's bias, receives one FMA per input
// channel in ascending order, is clamped at zero and stored.
static inline void DenseNet121OneApply4Tile(const char*restrict wts, const char*restrict dats, char*restrict out, ptrdiff_t rows) {
	__m512 acc[6][4];
	const float* coef = (const float*)wts;
	for (ptrdiff_t r = 0; r < rows; ++r) {
		const __m512 bias = _mm512_set1_ps(coef[r]);
		for (int c = 0; c < 4; ++c) {
			acc[r][c] = bias;
		}
	}
	for (ptrdiff_t s = 0; s < 160; ++s) {
		// Weights for input channel s follow the bias row, `rows` floats each.
		const float* wRow = coef+rows*(s+1);
		__m512 in[4];
		for (int c = 0; c < 4; ++c) {
			in[c] = _mm512_loadu_ps(dats+256*s+(ptrdiff_t)64*c);
		}
		for (ptrdiff_t r = 0; r < rows; ++r) {
			const __m512 w = _mm512_set1_ps(wRow[r]);
			for (int c = 0; c < 4; ++c) {
				acc[r][c] = _mm512_fmadd_ps(w, in[c], acc[r][c]);
			}
		}
	}
	const __m512 zero = _mm512_setzero_ps();
	for (ptrdiff_t r = 0; r < rows; ++r) {
		for (int c = 0; c < 4; ++c) {
			acc[r][c] = _mm512_max_ps(zero, acc[r][c]);
		}
		for (int c = 0; c < 4; ++c) {
			_mm512_mask_storeu_ps(out+12608*r+(ptrdiff_t)64*c, 65535, acc[r][c]);
		}
	}
}

// Worker for the multiply-accumulate pass of the fourth 1x1 convolution.
//
// task44->any1 points at a pair whose first element is an array of three
// byte pointers: [0] packed weights, [1] arranged inputs, [2] output.
// pt27[1] selects the 64-float column tile (49 does no work) and pt27[0]
// selects a chunk of output-channel groups starting at group 4*pt27[0].
//
// Groups 0..20 hold six output channels each (3864 bytes of coefficients,
// 75648 bytes of output); group 21 holds the remaining two channels.
// Chunks 0..3 process four six-row groups; any other chunk value runs up to
// six groups, and if the group counter reaches 21 it also emits the
// two-row tail.
static void DenseNet121OneApply4Callee1(DenseNet121ThreaderTask1* task44, int64_t* pt27) {
	void** args = task44->any1;
	char** bufs = args[0];
	const ptrdiff_t colTile = pt27[1];
	const ptrdiff_t chunk = pt27[0];
	char*restrict packedWts = bufs[0];
	char*restrict packedDats = bufs[1];
	char*restrict result = bufs[2];
	if (colTile == 49) return;
	const char* tileIn = packedDats+40960*colTile;
	char* tileOut = result+256*colTile;
	ptrdiff_t grp = 4*chunk;
	const ptrdiff_t lastGrp = grp+(chunk < 4 ? 3 : 5);
	while (grp != 21) {
		DenseNet121OneApply4Tile(packedWts+3864*grp, tileIn, tileOut+75648*grp, 6);
		if (grp >= lastGrp) return;
		++grp;
	}
	// Reached group 21: the final two output channels.
	DenseNet121OneApply4Tile(packedWts+3864*grp, tileIn, tileOut+75648*grp, 2);
}

// Launches the multiply-accumulate pass on the thread team.
//
// The task payload is a two-element pair {tensors41, 0}; the callee is
// DenseNet121OneApply4Callee1 and the iteration hull is the 3-D box
// {5, 49, 1}. The pair lives on this function's stack and is handed to
// DenseNet121ThreaderDo1 by address.
static void DenseNet121OneApply4(DenseNet121ThreaderTeam1* team34, char** tensors41) {
	static const int extents[3] = {5, 49, 1};
	void* payload[] = {tensors41, 0};
	DenseNet121ThreaderTask1 job;
	job.callee1 = DenseNet121OneApply4Callee1;
	job.any1 = payload;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	DenseNet121ThreaderDo1(team34, &job);
}

static void DenseNet121OneArrangeWts5Callee1(DenseNet121ThreaderTask1* task46, int64_t* pt28) {
char** tensors44 = task46->any1;
ptrdiff_t b49 = pt28[0];
char*restrict wtPtr7 = tensors44[0]+(ptrdiff_t)3340*0+(ptrdiff_t)98304*0;
char*restrict biasPtr7 = tensors44[1]+(ptrdiff_t)512*0;
char*restrict bnPtr10 = tensors44[2]+(ptrdiff_t)8*128*0;
char*restrict arranged9 = tensors44[3]+(ptrdiff_t)428032*0+(ptrdiff_t)98816*0;
ptrdiff_t ii13 = 1;
for (ptrdiff_t i31 = 0; i31 < ii13; ++i31) {
ptrdiff_t j23 = 4*b49;
ptrdiff_t jj32 = j23+4;
for (; j23 < jj32; ++j23) {
if (j23 < 7) {
ptrdiff_t k83 = 0+16*(j23-0);
ptrdiff_t l29 = (size_t)(0+k83)/6;
ptrdiff_t cut11 = (size_t)(0+k83)%6;
switch (cut11) {
case 0:;
case 2: {
__m512 sum183 = _mm512_maskz_loadu_ps(65535, biasPtr7+512*i31+4*k83);
__m512i pmMul14 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd14 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo13 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k83+128*i31));
__m512 masHi13 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k83+128*i31)+(ptrdiff_t)64);
__m512 postMul17 = _mm512_permutex2var_ps(masLo13, pmMul14, masHi13);
__m512 postAdd15 = _mm512_permutex2var_ps(masLo13, pmAdd14, masHi13);
sum183 = _mm512_fmadd_ps(sum183, postMul17, postAdd15);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)0, 63>>cut11, sum183);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)4608, 4032>>cut11, sum183);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)9216, 65535-(4095>>cut11), sum183);
ptrdiff_t c23 = 0;
for (; c23 != 12; ++c23) {
__m512 wt259 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)0);
__m512 wt260 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)768);
__m512 wt261 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)1536);
__m512 wt262 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)2304);
__m512 wt263 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)3072);
__m512 wt264 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)3840);
__m512 wt265 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)4608);
__m512 wt266 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)5376);
__m512 wt267 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)6144);
__m512 wt268 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)6912);
__m512 wt269 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)7680);
__m512 wt270 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)8448);
__m512 wt271 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)9216);
__m512 wt272 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)9984);
__m512 wt273 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)10752);
__m512 wt274 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c23+(ptrdiff_t)11520);
__m512 tmp5589 = _mm512_unpacklo_ps(wt259, wt260);
__m512 tmp5590 = _mm512_unpackhi_ps(wt259, wt260);
__m512 tmp5591 = _mm512_unpacklo_ps(wt261, wt262);
__m512 tmp5592 = _mm512_unpackhi_ps(wt261, wt262);
__m512 tmp5593 = _mm512_unpacklo_ps(wt263, wt264);
__m512 tmp5594 = _mm512_unpackhi_ps(wt263, wt264);
__m512 tmp5595 = _mm512_unpacklo_ps(wt265, wt266);
__m512 tmp5596 = _mm512_unpackhi_ps(wt265, wt266);
__m512 tmp5597 = _mm512_unpacklo_ps(wt267, wt268);
__m512 tmp5598 = _mm512_unpackhi_ps(wt267, wt268);
__m512 tmp5599 = _mm512_unpacklo_ps(wt269, wt270);
__m512 tmp5600 = _mm512_unpackhi_ps(wt269, wt270);
__m512 tmp5601 = _mm512_unpacklo_ps(wt271, wt272);
__m512 tmp5602 = _mm512_unpackhi_ps(wt271, wt272);
__m512 tmp5603 = _mm512_unpacklo_ps(wt273, wt274);
__m512 tmp5604 = _mm512_unpackhi_ps(wt273, wt274);
__m512 tmp5605 = _mm512_shuffle_ps(tmp5589, tmp5591, 68);
__m512 tmp5606 = _mm512_shuffle_ps(tmp5589, tmp5591, 238);
__m512 tmp5607 = _mm512_shuffle_ps(tmp5590, tmp5592, 68);
__m512 tmp5608 = _mm512_shuffle_ps(tmp5590, tmp5592, 238);
__m512 tmp5609 = _mm512_shuffle_ps(tmp5593, tmp5595, 68);
__m512 tmp5610 = _mm512_shuffle_ps(tmp5593, tmp5595, 238);
__m512 tmp5611 = _mm512_shuffle_ps(tmp5594, tmp5596, 68);
__m512 tmp5612 = _mm512_shuffle_ps(tmp5594, tmp5596, 238);
__m512 tmp5613 = _mm512_shuffle_ps(tmp5597, tmp5599, 68);
__m512 tmp5614 = _mm512_shuffle_ps(tmp5597, tmp5599, 238);
__m512 tmp5615 = _mm512_shuffle_ps(tmp5598, tmp5600, 68);
__m512 tmp5616 = _mm512_shuffle_ps(tmp5598, tmp5600, 238);
__m512 tmp5617 = _mm512_shuffle_ps(tmp5601, tmp5603, 68);
__m512 tmp5618 = _mm512_shuffle_ps(tmp5601, tmp5603, 238);
__m512 tmp5619 = _mm512_shuffle_ps(tmp5602, tmp5604, 68);
__m512 tmp5620 = _mm512_shuffle_ps(tmp5602, tmp5604, 238);
__m512 tmp5621 = _mm512_shuffle_f32x4(tmp5605, tmp5609, 136);
__m512 tmp5622 = _mm512_shuffle_f32x4(tmp5605, tmp5609, 221);
__m512 tmp5623 = _mm512_shuffle_f32x4(tmp5606, tmp5610, 136);
__m512 tmp5624 = _mm512_shuffle_f32x4(tmp5606, tmp5610, 221);
__m512 tmp5625 = _mm512_shuffle_f32x4(tmp5607, tmp5611, 136);
__m512 tmp5626 = _mm512_shuffle_f32x4(tmp5607, tmp5611, 221);
__m512 tmp5627 = _mm512_shuffle_f32x4(tmp5608, tmp5612, 136);
__m512 tmp5628 = _mm512_shuffle_f32x4(tmp5608, tmp5612, 221);
__m512 tmp5629 = _mm512_shuffle_f32x4(tmp5613, tmp5617, 136);
__m512 tmp5630 = _mm512_shuffle_f32x4(tmp5613, tmp5617, 221);
__m512 tmp5631 = _mm512_shuffle_f32x4(tmp5614, tmp5618, 136);
__m512 tmp5632 = _mm512_shuffle_f32x4(tmp5614, tmp5618, 221);
__m512 tmp5633 = _mm512_shuffle_f32x4(tmp5615, tmp5619, 136);
__m512 tmp5634 = _mm512_shuffle_f32x4(tmp5615, tmp5619, 221);
__m512 tmp5635 = _mm512_shuffle_f32x4(tmp5616, tmp5620, 136);
__m512 tmp5636 = _mm512_shuffle_f32x4(tmp5616, tmp5620, 221);
wt259 = _mm512_shuffle_f32x4(tmp5621, tmp5629, 136);
wt267 = _mm512_shuffle_f32x4(tmp5621, tmp5629, 221);
wt260 = _mm512_shuffle_f32x4(tmp5623, tmp5631, 136);
wt268 = _mm512_shuffle_f32x4(tmp5623, tmp5631, 221);
wt261 = _mm512_shuffle_f32x4(tmp5625, tmp5633, 136);
wt269 = _mm512_shuffle_f32x4(tmp5625, tmp5633, 221);
wt262 = _mm512_shuffle_f32x4(tmp5627, tmp5635, 136);
wt270 = _mm512_shuffle_f32x4(tmp5627, tmp5635, 221);
wt263 = _mm512_shuffle_f32x4(tmp5622, tmp5630, 136);
wt271 = _mm512_shuffle_f32x4(tmp5622, tmp5630, 221);
wt264 = _mm512_shuffle_f32x4(tmp5624, tmp5632, 136);
wt272 = _mm512_shuffle_f32x4(tmp5624, tmp5632, 221);
wt265 = _mm512_shuffle_f32x4(tmp5626, tmp5634, 136);
wt273 = _mm512_shuffle_f32x4(tmp5626, tmp5634, 221);
wt266 = _mm512_shuffle_f32x4(tmp5628, tmp5636, 136);
wt274 = _mm512_shuffle_f32x4(tmp5628, tmp5636, 221);
wt259 = _mm512_mul_ps(wt259, postMul17);
wt260 = _mm512_mul_ps(wt260, postMul17);
wt261 = _mm512_mul_ps(wt261, postMul17);
wt262 = _mm512_mul_ps(wt262, postMul17);
wt263 = _mm512_mul_ps(wt263, postMul17);
wt264 = _mm512_mul_ps(wt264, postMul17);
wt265 = _mm512_mul_ps(wt265, postMul17);
wt266 = _mm512_mul_ps(wt266, postMul17);
wt267 = _mm512_mul_ps(wt267, postMul17);
wt268 = _mm512_mul_ps(wt268, postMul17);
wt269 = _mm512_mul_ps(wt269, postMul17);
wt270 = _mm512_mul_ps(wt270, postMul17);
wt271 = _mm512_mul_ps(wt271, postMul17);
wt272 = _mm512_mul_ps(wt272, postMul17);
wt273 = _mm512_mul_ps(wt273, postMul17);
wt274 = _mm512_mul_ps(wt274, postMul17);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c23)+(ptrdiff_t)0, 63>>cut11, wt259);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c23)+(ptrdiff_t)0, 63>>cut11, wt260);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c23)+(ptrdiff_t)0, 63>>cut11, wt261);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c23)+(ptrdiff_t)0, 63>>cut11, wt262);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c23)+(ptrdiff_t)0, 63>>cut11, wt263);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c23)+(ptrdiff_t)0, 63>>cut11, wt264);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c23)+(ptrdiff_t)0, 63>>cut11, wt265);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c23)+(ptrdiff_t)0, 63>>cut11, wt266);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c23)+(ptrdiff_t)0, 63>>cut11, wt267);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c23)+(ptrdiff_t)0, 63>>cut11, wt268);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c23)+(ptrdiff_t)0, 63>>cut11, wt269);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c23)+(ptrdiff_t)0, 63>>cut11, wt270);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c23)+(ptrdiff_t)0, 63>>cut11, wt271);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c23)+(ptrdiff_t)0, 63>>cut11, wt272);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c23)+(ptrdiff_t)0, 63>>cut11, wt273);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c23)+(ptrdiff_t)0, 63>>cut11, wt274);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt259);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt260);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt261);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt262);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt263);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt264);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt265);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt266);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt267);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt268);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt269);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt270);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt271);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt272);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt273);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c23)+(ptrdiff_t)4608, 4032>>cut11, wt274);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt259);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt260);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt261);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt262);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt263);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt264);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt265);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt266);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt267);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt268);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt269);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt270);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt271);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt272);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt273);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c23)+(ptrdiff_t)9216, 65535-(4095>>cut11), wt274);
}
break;
}
default: {
cut11 = 4;
__m512 sum184 = _mm512_maskz_loadu_ps(65535, biasPtr7+512*i31+4*k83);
__m512i pmMul15 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd15 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo14 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k83+128*i31));
__m512 masHi14 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k83+128*i31)+(ptrdiff_t)64);
__m512 postMul18 = _mm512_permutex2var_ps(masLo14, pmMul15, masHi14);
__m512 postAdd16 = _mm512_permutex2var_ps(masLo14, pmAdd15, masHi14);
sum184 = _mm512_fmadd_ps(sum184, postMul18, postAdd16);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)0, 63>>cut11, sum184);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)4608, 4032>>cut11, sum184);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)9216, 258048>>cut11, sum184);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*0+(ptrdiff_t)13824, 65535-(262143>>cut11), sum184);
ptrdiff_t c24 = 0;
for (; c24 != 12; ++c24) {
__m512 wt275 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)0);
__m512 wt276 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)768);
__m512 wt277 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)1536);
__m512 wt278 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)2304);
__m512 wt279 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)3072);
__m512 wt280 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)3840);
__m512 wt281 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)4608);
__m512 wt282 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)5376);
__m512 wt283 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)6144);
__m512 wt284 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)6912);
__m512 wt285 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)7680);
__m512 wt286 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)8448);
__m512 wt287 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)9216);
__m512 wt288 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)9984);
__m512 wt289 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)10752);
__m512 wt290 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k83+64*c24+(ptrdiff_t)11520);
__m512 tmp5637 = _mm512_unpacklo_ps(wt275, wt276);
__m512 tmp5638 = _mm512_unpackhi_ps(wt275, wt276);
__m512 tmp5639 = _mm512_unpacklo_ps(wt277, wt278);
__m512 tmp5640 = _mm512_unpackhi_ps(wt277, wt278);
__m512 tmp5641 = _mm512_unpacklo_ps(wt279, wt280);
__m512 tmp5642 = _mm512_unpackhi_ps(wt279, wt280);
__m512 tmp5643 = _mm512_unpacklo_ps(wt281, wt282);
__m512 tmp5644 = _mm512_unpackhi_ps(wt281, wt282);
__m512 tmp5645 = _mm512_unpacklo_ps(wt283, wt284);
__m512 tmp5646 = _mm512_unpackhi_ps(wt283, wt284);
__m512 tmp5647 = _mm512_unpacklo_ps(wt285, wt286);
__m512 tmp5648 = _mm512_unpackhi_ps(wt285, wt286);
__m512 tmp5649 = _mm512_unpacklo_ps(wt287, wt288);
__m512 tmp5650 = _mm512_unpackhi_ps(wt287, wt288);
__m512 tmp5651 = _mm512_unpacklo_ps(wt289, wt290);
__m512 tmp5652 = _mm512_unpackhi_ps(wt289, wt290);
__m512 tmp5653 = _mm512_shuffle_ps(tmp5637, tmp5639, 68);
__m512 tmp5654 = _mm512_shuffle_ps(tmp5637, tmp5639, 238);
__m512 tmp5655 = _mm512_shuffle_ps(tmp5638, tmp5640, 68);
__m512 tmp5656 = _mm512_shuffle_ps(tmp5638, tmp5640, 238);
__m512 tmp5657 = _mm512_shuffle_ps(tmp5641, tmp5643, 68);
__m512 tmp5658 = _mm512_shuffle_ps(tmp5641, tmp5643, 238);
__m512 tmp5659 = _mm512_shuffle_ps(tmp5642, tmp5644, 68);
__m512 tmp5660 = _mm512_shuffle_ps(tmp5642, tmp5644, 238);
__m512 tmp5661 = _mm512_shuffle_ps(tmp5645, tmp5647, 68);
__m512 tmp5662 = _mm512_shuffle_ps(tmp5645, tmp5647, 238);
__m512 tmp5663 = _mm512_shuffle_ps(tmp5646, tmp5648, 68);
__m512 tmp5664 = _mm512_shuffle_ps(tmp5646, tmp5648, 238);
__m512 tmp5665 = _mm512_shuffle_ps(tmp5649, tmp5651, 68);
__m512 tmp5666 = _mm512_shuffle_ps(tmp5649, tmp5651, 238);
__m512 tmp5667 = _mm512_shuffle_ps(tmp5650, tmp5652, 68);
__m512 tmp5668 = _mm512_shuffle_ps(tmp5650, tmp5652, 238);
__m512 tmp5669 = _mm512_shuffle_f32x4(tmp5653, tmp5657, 136);
__m512 tmp5670 = _mm512_shuffle_f32x4(tmp5653, tmp5657, 221);
__m512 tmp5671 = _mm512_shuffle_f32x4(tmp5654, tmp5658, 136);
__m512 tmp5672 = _mm512_shuffle_f32x4(tmp5654, tmp5658, 221);
__m512 tmp5673 = _mm512_shuffle_f32x4(tmp5655, tmp5659, 136);
__m512 tmp5674 = _mm512_shuffle_f32x4(tmp5655, tmp5659, 221);
__m512 tmp5675 = _mm512_shuffle_f32x4(tmp5656, tmp5660, 136);
__m512 tmp5676 = _mm512_shuffle_f32x4(tmp5656, tmp5660, 221);
__m512 tmp5677 = _mm512_shuffle_f32x4(tmp5661, tmp5665, 136);
__m512 tmp5678 = _mm512_shuffle_f32x4(tmp5661, tmp5665, 221);
__m512 tmp5679 = _mm512_shuffle_f32x4(tmp5662, tmp5666, 136);
__m512 tmp5680 = _mm512_shuffle_f32x4(tmp5662, tmp5666, 221);
__m512 tmp5681 = _mm512_shuffle_f32x4(tmp5663, tmp5667, 136);
__m512 tmp5682 = _mm512_shuffle_f32x4(tmp5663, tmp5667, 221);
__m512 tmp5683 = _mm512_shuffle_f32x4(tmp5664, tmp5668, 136);
__m512 tmp5684 = _mm512_shuffle_f32x4(tmp5664, tmp5668, 221);
wt275 = _mm512_shuffle_f32x4(tmp5669, tmp5677, 136);
wt283 = _mm512_shuffle_f32x4(tmp5669, tmp5677, 221);
wt276 = _mm512_shuffle_f32x4(tmp5671, tmp5679, 136);
wt284 = _mm512_shuffle_f32x4(tmp5671, tmp5679, 221);
wt277 = _mm512_shuffle_f32x4(tmp5673, tmp5681, 136);
wt285 = _mm512_shuffle_f32x4(tmp5673, tmp5681, 221);
wt278 = _mm512_shuffle_f32x4(tmp5675, tmp5683, 136);
wt286 = _mm512_shuffle_f32x4(tmp5675, tmp5683, 221);
wt279 = _mm512_shuffle_f32x4(tmp5670, tmp5678, 136);
wt287 = _mm512_shuffle_f32x4(tmp5670, tmp5678, 221);
wt280 = _mm512_shuffle_f32x4(tmp5672, tmp5680, 136);
wt288 = _mm512_shuffle_f32x4(tmp5672, tmp5680, 221);
wt281 = _mm512_shuffle_f32x4(tmp5674, tmp5682, 136);
wt289 = _mm512_shuffle_f32x4(tmp5674, tmp5682, 221);
wt282 = _mm512_shuffle_f32x4(tmp5676, tmp5684, 136);
wt290 = _mm512_shuffle_f32x4(tmp5676, tmp5684, 221);
wt275 = _mm512_mul_ps(wt275, postMul18);
wt276 = _mm512_mul_ps(wt276, postMul18);
wt277 = _mm512_mul_ps(wt277, postMul18);
wt278 = _mm512_mul_ps(wt278, postMul18);
wt279 = _mm512_mul_ps(wt279, postMul18);
wt280 = _mm512_mul_ps(wt280, postMul18);
wt281 = _mm512_mul_ps(wt281, postMul18);
wt282 = _mm512_mul_ps(wt282, postMul18);
wt283 = _mm512_mul_ps(wt283, postMul18);
wt284 = _mm512_mul_ps(wt284, postMul18);
wt285 = _mm512_mul_ps(wt285, postMul18);
wt286 = _mm512_mul_ps(wt286, postMul18);
wt287 = _mm512_mul_ps(wt287, postMul18);
wt288 = _mm512_mul_ps(wt288, postMul18);
wt289 = _mm512_mul_ps(wt289, postMul18);
wt290 = _mm512_mul_ps(wt290, postMul18);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c24)+(ptrdiff_t)0, 63>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c24)+(ptrdiff_t)0, 63>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c24)+(ptrdiff_t)0, 63>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c24)+(ptrdiff_t)0, 63>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c24)+(ptrdiff_t)0, 63>>cut11, wt279);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c24)+(ptrdiff_t)0, 63>>cut11, wt280);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c24)+(ptrdiff_t)0, 63>>cut11, wt281);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c24)+(ptrdiff_t)0, 63>>cut11, wt282);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c24)+(ptrdiff_t)0, 63>>cut11, wt283);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c24)+(ptrdiff_t)0, 63>>cut11, wt284);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c24)+(ptrdiff_t)0, 63>>cut11, wt285);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c24)+(ptrdiff_t)0, 63>>cut11, wt286);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c24)+(ptrdiff_t)0, 63>>cut11, wt287);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c24)+(ptrdiff_t)0, 63>>cut11, wt288);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c24)+(ptrdiff_t)0, 63>>cut11, wt289);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c24)+(ptrdiff_t)0, 63>>cut11, wt290);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt279);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt280);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt281);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt282);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt283);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt284);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt285);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt286);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt287);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt288);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt289);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c24)+(ptrdiff_t)4608, 4032>>cut11, wt290);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt279);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt280);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt281);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt282);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt283);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt284);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt285);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt286);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt287);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt288);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt289);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c24)+(ptrdiff_t)9216, 258048>>cut11, wt290);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(1+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt275);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(2+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt276);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(3+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt277);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(4+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt278);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(5+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt279);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(6+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt280);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(7+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt281);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(8+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt282);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(9+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt283);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(10+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt284);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(11+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt285);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(12+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt286);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(13+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt287);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(14+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt288);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(15+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt289);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l29+4*cut11+24*(16+16*c24)+(ptrdiff_t)13824, 65535-(262143>>cut11), wt290);
}
}
}
} else {
ptrdiff_t k82 = 112;
ptrdiff_t l28 = (size_t)(0+k82)/6;
ptrdiff_t cut10 = (size_t)(0+k82)%6;
__m512 sum182 = _mm512_maskz_loadu_ps(65535, biasPtr7+512*i31+4*k82);
__m512i pmMul16 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd16 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo15 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k82+128*i31));
__m512 masHi15 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k82+128*i31)+(ptrdiff_t)64);
__m512 postMul16 = _mm512_permutex2var_ps(masLo15, pmMul16, masHi15);
__m512 postAdd14 = _mm512_permutex2var_ps(masLo15, pmAdd16, masHi15);
sum182 = _mm512_fmadd_ps(sum182, postMul16, postAdd14);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum182);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*0+(ptrdiff_t)4608, 4032>>cut10, sum182);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*0+(ptrdiff_t)9216, 258048>>cut10, sum182);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*0+(ptrdiff_t)13824, 65535-(262143>>cut10), sum182);
ptrdiff_t c22 = 0;
for (; c22 != 12; ++c22) {
__m512 wt243 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)0);
__m512 wt244 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)768);
__m512 wt245 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)1536);
__m512 wt246 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)2304);
__m512 wt247 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)3072);
__m512 wt248 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)3840);
__m512 wt249 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)4608);
__m512 wt250 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)5376);
__m512 wt251 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)6144);
__m512 wt252 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)6912);
__m512 wt253 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)7680);
__m512 wt254 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)8448);
__m512 wt255 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)9216);
__m512 wt256 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)9984);
__m512 wt257 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)10752);
__m512 wt258 = _mm512_maskz_loadu_ps(65535, wtPtr7+98304*i31+768*k82+64*c22+(ptrdiff_t)11520);
__m512 tmp5685 = _mm512_unpacklo_ps(wt243, wt244);
__m512 tmp5686 = _mm512_unpackhi_ps(wt243, wt244);
__m512 tmp5687 = _mm512_unpacklo_ps(wt245, wt246);
__m512 tmp5688 = _mm512_unpackhi_ps(wt245, wt246);
__m512 tmp5689 = _mm512_unpacklo_ps(wt247, wt248);
__m512 tmp5690 = _mm512_unpackhi_ps(wt247, wt248);
__m512 tmp5691 = _mm512_unpacklo_ps(wt249, wt250);
__m512 tmp5692 = _mm512_unpackhi_ps(wt249, wt250);
__m512 tmp5693 = _mm512_unpacklo_ps(wt251, wt252);
__m512 tmp5694 = _mm512_unpackhi_ps(wt251, wt252);
__m512 tmp5695 = _mm512_unpacklo_ps(wt253, wt254);
__m512 tmp5696 = _mm512_unpackhi_ps(wt253, wt254);
__m512 tmp5697 = _mm512_unpacklo_ps(wt255, wt256);
__m512 tmp5698 = _mm512_unpackhi_ps(wt255, wt256);
__m512 tmp5699 = _mm512_unpacklo_ps(wt257, wt258);
__m512 tmp5700 = _mm512_unpackhi_ps(wt257, wt258);
__m512 tmp5701 = _mm512_shuffle_ps(tmp5685, tmp5687, 68);
__m512 tmp5702 = _mm512_shuffle_ps(tmp5685, tmp5687, 238);
__m512 tmp5703 = _mm512_shuffle_ps(tmp5686, tmp5688, 68);
__m512 tmp5704 = _mm512_shuffle_ps(tmp5686, tmp5688, 238);
__m512 tmp5705 = _mm512_shuffle_ps(tmp5689, tmp5691, 68);
__m512 tmp5706 = _mm512_shuffle_ps(tmp5689, tmp5691, 238);
__m512 tmp5707 = _mm512_shuffle_ps(tmp5690, tmp5692, 68);
__m512 tmp5708 = _mm512_shuffle_ps(tmp5690, tmp5692, 238);
__m512 tmp5709 = _mm512_shuffle_ps(tmp5693, tmp5695, 68);
__m512 tmp5710 = _mm512_shuffle_ps(tmp5693, tmp5695, 238);
__m512 tmp5711 = _mm512_shuffle_ps(tmp5694, tmp5696, 68);
__m512 tmp5712 = _mm512_shuffle_ps(tmp5694, tmp5696, 238);
__m512 tmp5713 = _mm512_shuffle_ps(tmp5697, tmp5699, 68);
__m512 tmp5714 = _mm512_shuffle_ps(tmp5697, tmp5699, 238);
__m512 tmp5715 = _mm512_shuffle_ps(tmp5698, tmp5700, 68);
__m512 tmp5716 = _mm512_shuffle_ps(tmp5698, tmp5700, 238);
__m512 tmp5717 = _mm512_shuffle_f32x4(tmp5701, tmp5705, 136);
__m512 tmp5718 = _mm512_shuffle_f32x4(tmp5701, tmp5705, 221);
__m512 tmp5719 = _mm512_shuffle_f32x4(tmp5702, tmp5706, 136);
__m512 tmp5720 = _mm512_shuffle_f32x4(tmp5702, tmp5706, 221);
__m512 tmp5721 = _mm512_shuffle_f32x4(tmp5703, tmp5707, 136);
__m512 tmp5722 = _mm512_shuffle_f32x4(tmp5703, tmp5707, 221);
__m512 tmp5723 = _mm512_shuffle_f32x4(tmp5704, tmp5708, 136);
__m512 tmp5724 = _mm512_shuffle_f32x4(tmp5704, tmp5708, 221);
__m512 tmp5725 = _mm512_shuffle_f32x4(tmp5709, tmp5713, 136);
__m512 tmp5726 = _mm512_shuffle_f32x4(tmp5709, tmp5713, 221);
__m512 tmp5727 = _mm512_shuffle_f32x4(tmp5710, tmp5714, 136);
__m512 tmp5728 = _mm512_shuffle_f32x4(tmp5710, tmp5714, 221);
__m512 tmp5729 = _mm512_shuffle_f32x4(tmp5711, tmp5715, 136);
__m512 tmp5730 = _mm512_shuffle_f32x4(tmp5711, tmp5715, 221);
__m512 tmp5731 = _mm512_shuffle_f32x4(tmp5712, tmp5716, 136);
__m512 tmp5732 = _mm512_shuffle_f32x4(tmp5712, tmp5716, 221);
wt243 = _mm512_shuffle_f32x4(tmp5717, tmp5725, 136);
wt251 = _mm512_shuffle_f32x4(tmp5717, tmp5725, 221);
wt244 = _mm512_shuffle_f32x4(tmp5719, tmp5727, 136);
wt252 = _mm512_shuffle_f32x4(tmp5719, tmp5727, 221);
wt245 = _mm512_shuffle_f32x4(tmp5721, tmp5729, 136);
wt253 = _mm512_shuffle_f32x4(tmp5721, tmp5729, 221);
wt246 = _mm512_shuffle_f32x4(tmp5723, tmp5731, 136);
wt254 = _mm512_shuffle_f32x4(tmp5723, tmp5731, 221);
wt247 = _mm512_shuffle_f32x4(tmp5718, tmp5726, 136);
wt255 = _mm512_shuffle_f32x4(tmp5718, tmp5726, 221);
wt248 = _mm512_shuffle_f32x4(tmp5720, tmp5728, 136);
wt256 = _mm512_shuffle_f32x4(tmp5720, tmp5728, 221);
wt249 = _mm512_shuffle_f32x4(tmp5722, tmp5730, 136);
wt257 = _mm512_shuffle_f32x4(tmp5722, tmp5730, 221);
wt250 = _mm512_shuffle_f32x4(tmp5724, tmp5732, 136);
wt258 = _mm512_shuffle_f32x4(tmp5724, tmp5732, 221);
wt243 = _mm512_mul_ps(wt243, postMul16);
wt244 = _mm512_mul_ps(wt244, postMul16);
wt245 = _mm512_mul_ps(wt245, postMul16);
wt246 = _mm512_mul_ps(wt246, postMul16);
wt247 = _mm512_mul_ps(wt247, postMul16);
wt248 = _mm512_mul_ps(wt248, postMul16);
wt249 = _mm512_mul_ps(wt249, postMul16);
wt250 = _mm512_mul_ps(wt250, postMul16);
wt251 = _mm512_mul_ps(wt251, postMul16);
wt252 = _mm512_mul_ps(wt252, postMul16);
wt253 = _mm512_mul_ps(wt253, postMul16);
wt254 = _mm512_mul_ps(wt254, postMul16);
wt255 = _mm512_mul_ps(wt255, postMul16);
wt256 = _mm512_mul_ps(wt256, postMul16);
wt257 = _mm512_mul_ps(wt257, postMul16);
wt258 = _mm512_mul_ps(wt258, postMul16);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(1+16*c22)+(ptrdiff_t)0, 63>>cut10, wt243);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(2+16*c22)+(ptrdiff_t)0, 63>>cut10, wt244);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(3+16*c22)+(ptrdiff_t)0, 63>>cut10, wt245);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(4+16*c22)+(ptrdiff_t)0, 63>>cut10, wt246);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(5+16*c22)+(ptrdiff_t)0, 63>>cut10, wt247);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(6+16*c22)+(ptrdiff_t)0, 63>>cut10, wt248);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(7+16*c22)+(ptrdiff_t)0, 63>>cut10, wt249);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(8+16*c22)+(ptrdiff_t)0, 63>>cut10, wt250);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(9+16*c22)+(ptrdiff_t)0, 63>>cut10, wt251);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(10+16*c22)+(ptrdiff_t)0, 63>>cut10, wt252);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(11+16*c22)+(ptrdiff_t)0, 63>>cut10, wt253);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(12+16*c22)+(ptrdiff_t)0, 63>>cut10, wt254);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(13+16*c22)+(ptrdiff_t)0, 63>>cut10, wt255);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(14+16*c22)+(ptrdiff_t)0, 63>>cut10, wt256);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(15+16*c22)+(ptrdiff_t)0, 63>>cut10, wt257);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(16+16*c22)+(ptrdiff_t)0, 63>>cut10, wt258);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(1+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt243);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(2+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt244);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(3+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt245);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(4+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt246);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(5+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt247);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(6+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt248);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(7+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt249);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(8+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt250);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(9+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt251);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(10+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt252);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(11+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt253);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(12+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt254);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(13+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt255);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(14+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt256);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(15+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt257);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(16+16*c22)+(ptrdiff_t)4608, 4032>>cut10, wt258);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(1+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt243);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(2+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt244);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(3+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt245);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(4+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt246);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(5+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt247);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(6+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt248);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(7+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt249);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(8+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt250);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(9+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt251);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(10+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt252);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(11+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt253);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(12+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt254);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(13+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt255);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(14+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt256);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(15+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt257);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+24*(16+16*c22)+(ptrdiff_t)9216, 258048>>cut10, wt258);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(1+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt243);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(2+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt244);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(3+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt245);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(4+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt246);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(5+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt247);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(6+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt248);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(7+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt249);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(8+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt250);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(9+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt251);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(10+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt252);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(11+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt253);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(12+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt254);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(13+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt255);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(14+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt256);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(15+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt257);
_mm512_mask_storeu_ps(arranged9+98816*i31+4632*l28+4*cut10+8*(16+16*c22)+(ptrdiff_t)13824, 65535-(262143>>cut10), wt258);
}
}
}
}
}

// Fills a task descriptor (nd1 = 3, hull1 = {2, 1, 1}) whose callee is
// DenseNet121OneArrangeWts5Callee1 and hands it to DenseNet121ThreaderDo1.
// team35:    threader team forwarded to DenseNet121ThreaderDo1.
// tensors43: pointer table forwarded to the callee unchanged through any1.
static void DenseNet121OneArrangeWts5(DenseNet121ThreaderTeam1* team35, char** tensors43) {
    DenseNet121ThreaderTask1 wtsJob;
    // Extents first, then payload and entry point; the assignments are independent.
    wtsJob.nd1 = 3;
    wtsJob.hull1[0] = 2;
    wtsJob.hull1[1] = 1;
    wtsJob.hull1[2] = 1;
    wtsJob.any1 = tensors43;
    wtsJob.callee1 = DenseNet121OneArrangeWts5Callee1;
    DenseNet121ThreaderDo1(team35, &wtsJob);
}

// Worker for DenseNet121OneArrangeDats5. For the tile selected by pt29[1] it
// walks 192 rows of the source tensor; each row contributes 64 floats (four
// 16-float vectors) that are scaled and shifted by that row's (mul, add) pair,
// clamped at zero, and written to the arranged buffer.
// task48: task descriptor; any1 is read as the char** tensor table
//         ([0] source data, [1] (mul, add) float pairs, [2] arranged output).
// pt29:   task coordinates; only pt29[1] is read.
static void DenseNet121OneArrangeDats5Callee1(DenseNet121ThreaderTask1* task48, int64_t* pt29) {
    char** tensorTable = task48->any1;
    const ptrdiff_t tile = pt29[1];
    char*restrict srcBase = tensorTable[0];
    char*restrict bnBase = tensorTable[1];
    char*restrict dstBase = tensorTable[2];
    // A coordinate of 49 selects no work; any other value handles exactly one tile.
    if (tile == 49) {
        return;
    }
    const __m512 zero = _mm512_setzero_ps();
    for (ptrdiff_t row = 0; row < 192; ++row) {
        // Two consecutive floats per row: multiplier, then addend.
        const float* bnPair = (const float*)bnBase + (ptrdiff_t)2*row;
        const __m512 scale = _mm512_set1_ps(bnPair[0]);
        const __m512 shift = _mm512_set1_ps(bnPair[1]);
        // Source rows are 12608 bytes apart and tiles 256 bytes apart;
        // in the arranged buffer tiles are 49152 bytes apart and rows 256 bytes apart.
        char* src = srcBase + 256*tile + 12608*row;
        char* dst = dstBase + 49152*tile + 256*row;
        for (ptrdiff_t off = 0; off != 256; off += 64) {
            __m512 v = _mm512_maskz_loadu_ps(65535, src + off);
            v = _mm512_fmadd_ps(v, scale, shift);
            // Zero stays the first operand, as in the generated code.
            v = _mm512_max_ps(zero, v);
            _mm512_mask_storeu_ps(dst + off, 65535, v);
        }
    }
}

// Fills a task descriptor (nd1 = 4, hull1 = {1, 49, 1, 1}) whose callee is
// DenseNet121OneArrangeDats5Callee1 and hands it to DenseNet121ThreaderDo1.
// team36:    threader team forwarded to DenseNet121ThreaderDo1.
// tensors45: pointer table forwarded to the callee unchanged through any1.
static void DenseNet121OneArrangeDats5(DenseNet121ThreaderTeam1* team36, char** tensors45) {
    DenseNet121ThreaderTask1 datsJob;
    // Extents first, then payload and entry point; the assignments are independent.
    datsJob.nd1 = 4;
    datsJob.hull1[0] = 1;
    datsJob.hull1[1] = 49;
    datsJob.hull1[2] = 1;
    datsJob.hull1[3] = 1;
    datsJob.any1 = tensors45;
    datsJob.callee1 = DenseNet121OneArrangeDats5Callee1;
    DenseNet121ThreaderDo1(team36, &datsJob);
}

// Worker for DenseNet121OneApply5: multiplies arranged weights with arranged
// data, accumulating into 16-float vector sums, clamps the sums at zero, and
// stores them to the output tensor.
// task50: task descriptor; any1 is read as a void*[] whose element 0 is the
//         char** tensor table.
// pt30:   task coordinates; pt30[0] (w37) selects a block of weight groups,
//         pt30[1] (d9) selects the data tile.
// Tensor table as used here: [0] arranged weights (read), [1] arranged data
// (read), [2] output data (written).
static void DenseNet121OneApply5Callee1(DenseNet121ThreaderTask1* task50, int64_t* pt30) {
void** pair12 = task50->any1;
char** tensors48 = pair12[0];
// e13 and g14 are fixed at 0 here, so every offset they scale below is zero.
ptrdiff_t e13 = 0;
ptrdiff_t g14 = 0;
ptrdiff_t d9 = pt30[1];
ptrdiff_t w37 = pt30[0];
char*restrict arrangedWts5 = tensors48[0]+428032*e13+(ptrdiff_t)98816*1*g14;
char*restrict arrangedDats5 = tensors48[1]+10474240*e13+(ptrdiff_t)2408448*1*g14;
char*restrict datPtr14 = tensors48[2]+(ptrdiff_t)1613824*1*g14;
// ii15 is 1, so the outer loop body runs once with i33 == 0.
ptrdiff_t ii15 = 1;
for (ptrdiff_t i33 = 0; i33 < ii15; ++i33) {
ptrdiff_t j25 = 1*d9;
ptrdiff_t jj34 = j25+0;
// Nothing runs when d9 == 49; otherwise the function returns during the first
// iteration (at the k85 check or at the j25 check further down), so a call
// handles at most one tile.
for (; j25 != 49; ++j25) {
// Weight groups of six output rows. For w37 < 6 the check at the bottom of
// this loop returns after groups 3*w37 .. 3*w37+2. For w37 == 6 groups
// 18..20 run, the loop ends at k85 == 21, and the two-row tail below handles
// group 21.
ptrdiff_t k85 = 3*w37;
ptrdiff_t kk34 = k85+(w37 < 6 ? 2 : 3);
for (; k85 != 21; ++k85) {
// With s20 == -1 these addresses are the first six floats of the group's
// 4632-byte record (4632 == 24*193); they seed the accumulators.
// NOTE(review): presumably the bias values written by the weight-arranging
// pass -- confirm against DenseNet121OneArrangeWts5Callee1.
ptrdiff_t s20 = -1;
__m512 sum185 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)24));
__m512 sum189 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)28));
__m512 sum193 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)32));
__m512 sum197 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)36));
__m512 sum201 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)40));
__m512 sum205 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)44));
// Each seed is copied into four accumulators, one per 64-byte data vector.
__m512 sum186 = sum185;
__m512 sum187 = sum185;
__m512 sum188 = sum185;
__m512 sum190 = sum189;
__m512 sum191 = sum189;
__m512 sum192 = sum189;
__m512 sum194 = sum193;
__m512 sum195 = sum193;
__m512 sum196 = sum193;
__m512 sum198 = sum197;
__m512 sum199 = sum197;
__m512 sum200 = sum197;
__m512 sum202 = sum201;
__m512 sum203 = sum201;
__m512 sum204 = sum201;
__m512 sum206 = sum205;
__m512 sum207 = sum205;
__m512 sum208 = sum205;
// 192 reduction steps: load four 16-float data vectors (256 bytes per step),
// then broadcast each of the step's six weights (24 bytes per step) and
// fused-multiply-add it into that row's four accumulators.
for (s20 = 0; s20 < 192; ++s20) {
__m512 dat1311 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s20+(ptrdiff_t)0);
__m512 dat1312 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s20+(ptrdiff_t)64);
__m512 dat1313 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s20+(ptrdiff_t)128);
__m512 dat1314 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s20+(ptrdiff_t)192);
__m512 wt291 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)24));
sum185 = _mm512_fmadd_ps(wt291, dat1311, sum185);
sum186 = _mm512_fmadd_ps(wt291, dat1312, sum186);
sum187 = _mm512_fmadd_ps(wt291, dat1313, sum187);
sum188 = _mm512_fmadd_ps(wt291, dat1314, sum188);
__m512 wt292 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)28));
sum189 = _mm512_fmadd_ps(wt292, dat1311, sum189);
sum190 = _mm512_fmadd_ps(wt292, dat1312, sum190);
sum191 = _mm512_fmadd_ps(wt292, dat1313, sum191);
sum192 = _mm512_fmadd_ps(wt292, dat1314, sum192);
__m512 wt293 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)32));
sum193 = _mm512_fmadd_ps(wt293, dat1311, sum193);
sum194 = _mm512_fmadd_ps(wt293, dat1312, sum194);
sum195 = _mm512_fmadd_ps(wt293, dat1313, sum195);
sum196 = _mm512_fmadd_ps(wt293, dat1314, sum196);
__m512 wt294 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)36));
sum197 = _mm512_fmadd_ps(wt294, dat1311, sum197);
sum198 = _mm512_fmadd_ps(wt294, dat1312, sum198);
sum199 = _mm512_fmadd_ps(wt294, dat1313, sum199);
sum200 = _mm512_fmadd_ps(wt294, dat1314, sum200);
__m512 wt295 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)40));
sum201 = _mm512_fmadd_ps(wt295, dat1311, sum201);
sum202 = _mm512_fmadd_ps(wt295, dat1312, sum202);
sum203 = _mm512_fmadd_ps(wt295, dat1313, sum203);
sum204 = _mm512_fmadd_ps(wt295, dat1314, sum204);
__m512 wt296 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+24*s20+(ptrdiff_t)44));
sum205 = _mm512_fmadd_ps(wt296, dat1311, sum205);
sum206 = _mm512_fmadd_ps(wt296, dat1312, sum206);
sum207 = _mm512_fmadd_ps(wt296, dat1313, sum207);
sum208 = _mm512_fmadd_ps(wt296, dat1314, sum208);
}
// Per row: clamp negatives to zero, then store the row's four vectors.
// Output rows are 12608 bytes apart; a group of six rows spans 75648 bytes
// (6*12608), and the tile offset is 256*j25.
sum185 = _mm512_max_ps(_mm512_setzero_ps(), sum185);
sum186 = _mm512_max_ps(_mm512_setzero_ps(), sum186);
sum187 = _mm512_max_ps(_mm512_setzero_ps(), sum187);
sum188 = _mm512_max_ps(_mm512_setzero_ps(), sum188);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)0, 65535, sum185);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)64, 65535, sum186);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)128, 65535, sum187);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)192, 65535, sum188);
sum189 = _mm512_max_ps(_mm512_setzero_ps(), sum189);
sum190 = _mm512_max_ps(_mm512_setzero_ps(), sum190);
sum191 = _mm512_max_ps(_mm512_setzero_ps(), sum191);
sum192 = _mm512_max_ps(_mm512_setzero_ps(), sum192);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12608, 65535, sum189);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12672, 65535, sum190);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12736, 65535, sum191);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12800, 65535, sum192);
sum193 = _mm512_max_ps(_mm512_setzero_ps(), sum193);
sum194 = _mm512_max_ps(_mm512_setzero_ps(), sum194);
sum195 = _mm512_max_ps(_mm512_setzero_ps(), sum195);
sum196 = _mm512_max_ps(_mm512_setzero_ps(), sum196);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)25216, 65535, sum193);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)25280, 65535, sum194);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)25344, 65535, sum195);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)25408, 65535, sum196);
sum197 = _mm512_max_ps(_mm512_setzero_ps(), sum197);
sum198 = _mm512_max_ps(_mm512_setzero_ps(), sum198);
sum199 = _mm512_max_ps(_mm512_setzero_ps(), sum199);
sum200 = _mm512_max_ps(_mm512_setzero_ps(), sum200);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)37824, 65535, sum197);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)37888, 65535, sum198);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)37952, 65535, sum199);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)38016, 65535, sum200);
sum201 = _mm512_max_ps(_mm512_setzero_ps(), sum201);
sum202 = _mm512_max_ps(_mm512_setzero_ps(), sum202);
sum203 = _mm512_max_ps(_mm512_setzero_ps(), sum203);
sum204 = _mm512_max_ps(_mm512_setzero_ps(), sum204);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)50432, 65535, sum201);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)50496, 65535, sum202);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)50560, 65535, sum203);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)50624, 65535, sum204);
sum205 = _mm512_max_ps(_mm512_setzero_ps(), sum205);
sum206 = _mm512_max_ps(_mm512_setzero_ps(), sum206);
sum207 = _mm512_max_ps(_mm512_setzero_ps(), sum207);
sum208 = _mm512_max_ps(_mm512_setzero_ps(), sum208);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)63040, 65535, sum205);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)63104, 65535, sum206);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)63168, 65535, sum207);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)63232, 65535, sum208);
// Last group of this task's block done: leave without running the tail.
if (k85 >= kk34) return;
}
// Tail group, reached only when the loop above ended at k85 == 21. Same
// scheme with two rows, so the weight record uses an 8-byte stride (two
// floats per step) and s21 == -1 again addresses the two seed values.
ptrdiff_t s21 = -1;
__m512 sum209 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+8*s21+(ptrdiff_t)8));
__m512 sum213 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+8*s21+(ptrdiff_t)12));
__m512 sum210 = sum209;
__m512 sum211 = sum209;
__m512 sum212 = sum209;
__m512 sum214 = sum213;
__m512 sum215 = sum213;
__m512 sum216 = sum213;
for (s21 = 0; s21 < 192; ++s21) {
__m512 dat1315 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s21+(ptrdiff_t)0);
__m512 dat1316 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s21+(ptrdiff_t)64);
__m512 dat1317 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s21+(ptrdiff_t)128);
__m512 dat1318 = _mm512_loadu_ps(arrangedDats5+2408448*i33+49152*j25+256*s21+(ptrdiff_t)192);
__m512 wt297 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+8*s21+(ptrdiff_t)8));
sum209 = _mm512_fmadd_ps(wt297, dat1315, sum209);
sum210 = _mm512_fmadd_ps(wt297, dat1316, sum210);
sum211 = _mm512_fmadd_ps(wt297, dat1317, sum211);
sum212 = _mm512_fmadd_ps(wt297, dat1318, sum212);
__m512 wt298 = _mm512_set1_ps(*(float*)(arrangedWts5+98816*i33+4632*k85+8*s21+(ptrdiff_t)12));
sum213 = _mm512_fmadd_ps(wt298, dat1315, sum213);
sum214 = _mm512_fmadd_ps(wt298, dat1316, sum214);
sum215 = _mm512_fmadd_ps(wt298, dat1317, sum215);
sum216 = _mm512_fmadd_ps(wt298, dat1318, sum216);
}
// Clamp at zero and store the two tail rows (12608 bytes apart).
sum209 = _mm512_max_ps(_mm512_setzero_ps(), sum209);
sum210 = _mm512_max_ps(_mm512_setzero_ps(), sum210);
sum211 = _mm512_max_ps(_mm512_setzero_ps(), sum211);
sum212 = _mm512_max_ps(_mm512_setzero_ps(), sum212);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)0, 65535, sum209);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)64, 65535, sum210);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)128, 65535, sum211);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)192, 65535, sum212);
sum213 = _mm512_max_ps(_mm512_setzero_ps(), sum213);
sum214 = _mm512_max_ps(_mm512_setzero_ps(), sum214);
sum215 = _mm512_max_ps(_mm512_setzero_ps(), sum215);
sum216 = _mm512_max_ps(_mm512_setzero_ps(), sum216);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12608, 65535, sum213);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12672, 65535, sum214);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12736, 65535, sum215);
_mm512_mask_storeu_ps(datPtr14+1613824*i33+256*j25+75648*k85+(ptrdiff_t)12800, 65535, sum216);
// jj34 equals the starting j25, so this returns after the first tile.
if (j25 >= jj34) return;
}
}
}

/*
 * DenseNet121OneApply5: fills in a threader task whose callee is
 * DenseNet121OneApply5Callee1 and submits it with DenseNet121ThreaderDo1.
 *
 * team37    - threader team, forwarded unchanged to DenseNet121ThreaderDo1.
 * tensors47 - tensor pointer table; placed in slot 0 of the two-element
 *             context array that the task carries in its any1 field.
 */
static void DenseNet121OneApply5(DenseNet121ThreaderTeam1* team37, char** tensors47) {
	/* Context handed to the callee through any1: {tensor table, 0}. */
	void* ctx[2] = {tensors47, 0};
	DenseNet121ThreaderTask1 job;
	/* nd1 = 3 and exactly three hull1 entries are set: 7, 49, 1. */
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 49;
	job.hull1[0] = 7;
	job.any1 = ctx;
	job.callee1 = DenseNet121OneApply5Callee1;
	/* NOTE(review): ctx and job are stack locals, so this presumably relies on
	   DenseNet121ThreaderDo1 not returning until the task is done - confirm. */
	DenseNet121ThreaderDo1(team37, &job);
}

static void DenseNet121OneArrangeWts6Callee1(DenseNet121ThreaderTask1* task52, int64_t* pt31) {
char** tensors50 = task52->any1;
ptrdiff_t b50 = pt31[0];
char*restrict wtPtr8 = tensors50[0]+(ptrdiff_t)3340*0+(ptrdiff_t)114688*0;
char*restrict biasPtr8 = tensors50[1]+(ptrdiff_t)512*0;
char*restrict bnPtr12 = tensors50[2]+(ptrdiff_t)8*128*0;
char*restrict arranged11 = tensors50[3]+(ptrdiff_t)428032*0+(ptrdiff_t)115200*0;
ptrdiff_t ii16 = 1;
for (ptrdiff_t i35 = 0; i35 < ii16; ++i35) {
ptrdiff_t j26 = 4*b50;
ptrdiff_t jj35 = j26+4;
for (; j26 < jj35; ++j26) {
if (j26 < 7) {
ptrdiff_t k87 = 0+16*(j26-0);
ptrdiff_t l31 = (size_t)(0+k87)/6;
ptrdiff_t cut13 = (size_t)(0+k87)%6;
switch (cut13) {
case 0:;
case 2: {
__m512 sum218 = _mm512_maskz_loadu_ps(65535, biasPtr8+512*i35+4*k87);
__m512i pmMul17 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd17 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo16 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k87+128*i35));
__m512 masHi16 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k87+128*i35)+(ptrdiff_t)64);
__m512 postMul20 = _mm512_permutex2var_ps(masLo16, pmMul17, masHi16);
__m512 postAdd18 = _mm512_permutex2var_ps(masLo16, pmAdd17, masHi16);
sum218 = _mm512_fmadd_ps(sum218, postMul20, postAdd18);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)0, 63>>cut13, sum218);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)5376, 4032>>cut13, sum218);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)10752, 65535-(4095>>cut13), sum218);
ptrdiff_t c27 = 0;
for (; c27 != 14; ++c27) {
__m512 wt315 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)0);
__m512 wt316 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)896);
__m512 wt317 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)1792);
__m512 wt318 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)2688);
__m512 wt319 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)3584);
__m512 wt320 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)4480);
__m512 wt321 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)5376);
__m512 wt322 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)6272);
__m512 wt323 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)7168);
__m512 wt324 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)8064);
__m512 wt325 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)8960);
__m512 wt326 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)9856);
__m512 wt327 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)10752);
__m512 wt328 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)11648);
__m512 wt329 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)12544);
__m512 wt330 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c27+(ptrdiff_t)13440);
__m512 tmp5733 = _mm512_unpacklo_ps(wt315, wt316);
__m512 tmp5734 = _mm512_unpackhi_ps(wt315, wt316);
__m512 tmp5735 = _mm512_unpacklo_ps(wt317, wt318);
__m512 tmp5736 = _mm512_unpackhi_ps(wt317, wt318);
__m512 tmp5737 = _mm512_unpacklo_ps(wt319, wt320);
__m512 tmp5738 = _mm512_unpackhi_ps(wt319, wt320);
__m512 tmp5739 = _mm512_unpacklo_ps(wt321, wt322);
__m512 tmp5740 = _mm512_unpackhi_ps(wt321, wt322);
__m512 tmp5741 = _mm512_unpacklo_ps(wt323, wt324);
__m512 tmp5742 = _mm512_unpackhi_ps(wt323, wt324);
__m512 tmp5743 = _mm512_unpacklo_ps(wt325, wt326);
__m512 tmp5744 = _mm512_unpackhi_ps(wt325, wt326);
__m512 tmp5745 = _mm512_unpacklo_ps(wt327, wt328);
__m512 tmp5746 = _mm512_unpackhi_ps(wt327, wt328);
__m512 tmp5747 = _mm512_unpacklo_ps(wt329, wt330);
__m512 tmp5748 = _mm512_unpackhi_ps(wt329, wt330);
__m512 tmp5749 = _mm512_shuffle_ps(tmp5733, tmp5735, 68);
__m512 tmp5750 = _mm512_shuffle_ps(tmp5733, tmp5735, 238);
__m512 tmp5751 = _mm512_shuffle_ps(tmp5734, tmp5736, 68);
__m512 tmp5752 = _mm512_shuffle_ps(tmp5734, tmp5736, 238);
__m512 tmp5753 = _mm512_shuffle_ps(tmp5737, tmp5739, 68);
__m512 tmp5754 = _mm512_shuffle_ps(tmp5737, tmp5739, 238);
__m512 tmp5755 = _mm512_shuffle_ps(tmp5738, tmp5740, 68);
__m512 tmp5756 = _mm512_shuffle_ps(tmp5738, tmp5740, 238);
__m512 tmp5757 = _mm512_shuffle_ps(tmp5741, tmp5743, 68);
__m512 tmp5758 = _mm512_shuffle_ps(tmp5741, tmp5743, 238);
__m512 tmp5759 = _mm512_shuffle_ps(tmp5742, tmp5744, 68);
__m512 tmp5760 = _mm512_shuffle_ps(tmp5742, tmp5744, 238);
__m512 tmp5761 = _mm512_shuffle_ps(tmp5745, tmp5747, 68);
__m512 tmp5762 = _mm512_shuffle_ps(tmp5745, tmp5747, 238);
__m512 tmp5763 = _mm512_shuffle_ps(tmp5746, tmp5748, 68);
__m512 tmp5764 = _mm512_shuffle_ps(tmp5746, tmp5748, 238);
__m512 tmp5765 = _mm512_shuffle_f32x4(tmp5749, tmp5753, 136);
__m512 tmp5766 = _mm512_shuffle_f32x4(tmp5749, tmp5753, 221);
__m512 tmp5767 = _mm512_shuffle_f32x4(tmp5750, tmp5754, 136);
__m512 tmp5768 = _mm512_shuffle_f32x4(tmp5750, tmp5754, 221);
__m512 tmp5769 = _mm512_shuffle_f32x4(tmp5751, tmp5755, 136);
__m512 tmp5770 = _mm512_shuffle_f32x4(tmp5751, tmp5755, 221);
__m512 tmp5771 = _mm512_shuffle_f32x4(tmp5752, tmp5756, 136);
__m512 tmp5772 = _mm512_shuffle_f32x4(tmp5752, tmp5756, 221);
__m512 tmp5773 = _mm512_shuffle_f32x4(tmp5757, tmp5761, 136);
__m512 tmp5774 = _mm512_shuffle_f32x4(tmp5757, tmp5761, 221);
__m512 tmp5775 = _mm512_shuffle_f32x4(tmp5758, tmp5762, 136);
__m512 tmp5776 = _mm512_shuffle_f32x4(tmp5758, tmp5762, 221);
__m512 tmp5777 = _mm512_shuffle_f32x4(tmp5759, tmp5763, 136);
__m512 tmp5778 = _mm512_shuffle_f32x4(tmp5759, tmp5763, 221);
__m512 tmp5779 = _mm512_shuffle_f32x4(tmp5760, tmp5764, 136);
__m512 tmp5780 = _mm512_shuffle_f32x4(tmp5760, tmp5764, 221);
wt315 = _mm512_shuffle_f32x4(tmp5765, tmp5773, 136);
wt323 = _mm512_shuffle_f32x4(tmp5765, tmp5773, 221);
wt316 = _mm512_shuffle_f32x4(tmp5767, tmp5775, 136);
wt324 = _mm512_shuffle_f32x4(tmp5767, tmp5775, 221);
wt317 = _mm512_shuffle_f32x4(tmp5769, tmp5777, 136);
wt325 = _mm512_shuffle_f32x4(tmp5769, tmp5777, 221);
wt318 = _mm512_shuffle_f32x4(tmp5771, tmp5779, 136);
wt326 = _mm512_shuffle_f32x4(tmp5771, tmp5779, 221);
wt319 = _mm512_shuffle_f32x4(tmp5766, tmp5774, 136);
wt327 = _mm512_shuffle_f32x4(tmp5766, tmp5774, 221);
wt320 = _mm512_shuffle_f32x4(tmp5768, tmp5776, 136);
wt328 = _mm512_shuffle_f32x4(tmp5768, tmp5776, 221);
wt321 = _mm512_shuffle_f32x4(tmp5770, tmp5778, 136);
wt329 = _mm512_shuffle_f32x4(tmp5770, tmp5778, 221);
wt322 = _mm512_shuffle_f32x4(tmp5772, tmp5780, 136);
wt330 = _mm512_shuffle_f32x4(tmp5772, tmp5780, 221);
wt315 = _mm512_mul_ps(wt315, postMul20);
wt316 = _mm512_mul_ps(wt316, postMul20);
wt317 = _mm512_mul_ps(wt317, postMul20);
wt318 = _mm512_mul_ps(wt318, postMul20);
wt319 = _mm512_mul_ps(wt319, postMul20);
wt320 = _mm512_mul_ps(wt320, postMul20);
wt321 = _mm512_mul_ps(wt321, postMul20);
wt322 = _mm512_mul_ps(wt322, postMul20);
wt323 = _mm512_mul_ps(wt323, postMul20);
wt324 = _mm512_mul_ps(wt324, postMul20);
wt325 = _mm512_mul_ps(wt325, postMul20);
wt326 = _mm512_mul_ps(wt326, postMul20);
wt327 = _mm512_mul_ps(wt327, postMul20);
wt328 = _mm512_mul_ps(wt328, postMul20);
wt329 = _mm512_mul_ps(wt329, postMul20);
wt330 = _mm512_mul_ps(wt330, postMul20);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c27)+(ptrdiff_t)0, 63>>cut13, wt315);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c27)+(ptrdiff_t)0, 63>>cut13, wt316);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c27)+(ptrdiff_t)0, 63>>cut13, wt317);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c27)+(ptrdiff_t)0, 63>>cut13, wt318);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c27)+(ptrdiff_t)0, 63>>cut13, wt319);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c27)+(ptrdiff_t)0, 63>>cut13, wt320);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c27)+(ptrdiff_t)0, 63>>cut13, wt321);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c27)+(ptrdiff_t)0, 63>>cut13, wt322);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c27)+(ptrdiff_t)0, 63>>cut13, wt323);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c27)+(ptrdiff_t)0, 63>>cut13, wt324);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c27)+(ptrdiff_t)0, 63>>cut13, wt325);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c27)+(ptrdiff_t)0, 63>>cut13, wt326);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c27)+(ptrdiff_t)0, 63>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c27)+(ptrdiff_t)0, 63>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c27)+(ptrdiff_t)0, 63>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c27)+(ptrdiff_t)0, 63>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt315);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt316);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt317);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt318);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt319);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt320);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt321);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt322);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt323);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt324);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt325);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt326);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c27)+(ptrdiff_t)5376, 4032>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt315);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt316);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt317);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt318);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt319);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt320);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt321);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt322);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt323);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt324);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt325);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt326);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt327);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt328);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt329);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c27)+(ptrdiff_t)10752, 65535-(4095>>cut13), wt330);
}
break;
}
default: {
cut13 = 4;
__m512 sum219 = _mm512_maskz_loadu_ps(65535, biasPtr8+512*i35+4*k87);
__m512i pmMul18 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd18 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo17 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k87+128*i35));
__m512 masHi17 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k87+128*i35)+(ptrdiff_t)64);
__m512 postMul21 = _mm512_permutex2var_ps(masLo17, pmMul18, masHi17);
__m512 postAdd19 = _mm512_permutex2var_ps(masLo17, pmAdd18, masHi17);
sum219 = _mm512_fmadd_ps(sum219, postMul21, postAdd19);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)0, 63>>cut13, sum219);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)5376, 4032>>cut13, sum219);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)10752, 258048>>cut13, sum219);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*0+(ptrdiff_t)16128, 65535-(262143>>cut13), sum219);
ptrdiff_t c28 = 0;
for (; c28 != 14; ++c28) {
__m512 wt331 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)0);
__m512 wt332 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)896);
__m512 wt333 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)1792);
__m512 wt334 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)2688);
__m512 wt335 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)3584);
__m512 wt336 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)4480);
__m512 wt337 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)5376);
__m512 wt338 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)6272);
__m512 wt339 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)7168);
__m512 wt340 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)8064);
__m512 wt341 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)8960);
__m512 wt342 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)9856);
__m512 wt343 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)10752);
__m512 wt344 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)11648);
__m512 wt345 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)12544);
__m512 wt346 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k87+64*c28+(ptrdiff_t)13440);
__m512 tmp5781 = _mm512_unpacklo_ps(wt331, wt332);
__m512 tmp5782 = _mm512_unpackhi_ps(wt331, wt332);
__m512 tmp5783 = _mm512_unpacklo_ps(wt333, wt334);
__m512 tmp5784 = _mm512_unpackhi_ps(wt333, wt334);
__m512 tmp5785 = _mm512_unpacklo_ps(wt335, wt336);
__m512 tmp5786 = _mm512_unpackhi_ps(wt335, wt336);
__m512 tmp5787 = _mm512_unpacklo_ps(wt337, wt338);
__m512 tmp5788 = _mm512_unpackhi_ps(wt337, wt338);
__m512 tmp5789 = _mm512_unpacklo_ps(wt339, wt340);
__m512 tmp5790 = _mm512_unpackhi_ps(wt339, wt340);
__m512 tmp5791 = _mm512_unpacklo_ps(wt341, wt342);
__m512 tmp5792 = _mm512_unpackhi_ps(wt341, wt342);
__m512 tmp5793 = _mm512_unpacklo_ps(wt343, wt344);
__m512 tmp5794 = _mm512_unpackhi_ps(wt343, wt344);
__m512 tmp5795 = _mm512_unpacklo_ps(wt345, wt346);
__m512 tmp5796 = _mm512_unpackhi_ps(wt345, wt346);
__m512 tmp5797 = _mm512_shuffle_ps(tmp5781, tmp5783, 68);
__m512 tmp5798 = _mm512_shuffle_ps(tmp5781, tmp5783, 238);
__m512 tmp5799 = _mm512_shuffle_ps(tmp5782, tmp5784, 68);
__m512 tmp5800 = _mm512_shuffle_ps(tmp5782, tmp5784, 238);
__m512 tmp5801 = _mm512_shuffle_ps(tmp5785, tmp5787, 68);
__m512 tmp5802 = _mm512_shuffle_ps(tmp5785, tmp5787, 238);
__m512 tmp5803 = _mm512_shuffle_ps(tmp5786, tmp5788, 68);
__m512 tmp5804 = _mm512_shuffle_ps(tmp5786, tmp5788, 238);
__m512 tmp5805 = _mm512_shuffle_ps(tmp5789, tmp5791, 68);
__m512 tmp5806 = _mm512_shuffle_ps(tmp5789, tmp5791, 238);
__m512 tmp5807 = _mm512_shuffle_ps(tmp5790, tmp5792, 68);
__m512 tmp5808 = _mm512_shuffle_ps(tmp5790, tmp5792, 238);
__m512 tmp5809 = _mm512_shuffle_ps(tmp5793, tmp5795, 68);
__m512 tmp5810 = _mm512_shuffle_ps(tmp5793, tmp5795, 238);
__m512 tmp5811 = _mm512_shuffle_ps(tmp5794, tmp5796, 68);
__m512 tmp5812 = _mm512_shuffle_ps(tmp5794, tmp5796, 238);
__m512 tmp5813 = _mm512_shuffle_f32x4(tmp5797, tmp5801, 136);
__m512 tmp5814 = _mm512_shuffle_f32x4(tmp5797, tmp5801, 221);
__m512 tmp5815 = _mm512_shuffle_f32x4(tmp5798, tmp5802, 136);
__m512 tmp5816 = _mm512_shuffle_f32x4(tmp5798, tmp5802, 221);
__m512 tmp5817 = _mm512_shuffle_f32x4(tmp5799, tmp5803, 136);
__m512 tmp5818 = _mm512_shuffle_f32x4(tmp5799, tmp5803, 221);
__m512 tmp5819 = _mm512_shuffle_f32x4(tmp5800, tmp5804, 136);
__m512 tmp5820 = _mm512_shuffle_f32x4(tmp5800, tmp5804, 221);
__m512 tmp5821 = _mm512_shuffle_f32x4(tmp5805, tmp5809, 136);
__m512 tmp5822 = _mm512_shuffle_f32x4(tmp5805, tmp5809, 221);
__m512 tmp5823 = _mm512_shuffle_f32x4(tmp5806, tmp5810, 136);
__m512 tmp5824 = _mm512_shuffle_f32x4(tmp5806, tmp5810, 221);
__m512 tmp5825 = _mm512_shuffle_f32x4(tmp5807, tmp5811, 136);
__m512 tmp5826 = _mm512_shuffle_f32x4(tmp5807, tmp5811, 221);
__m512 tmp5827 = _mm512_shuffle_f32x4(tmp5808, tmp5812, 136);
__m512 tmp5828 = _mm512_shuffle_f32x4(tmp5808, tmp5812, 221);
wt331 = _mm512_shuffle_f32x4(tmp5813, tmp5821, 136);
wt339 = _mm512_shuffle_f32x4(tmp5813, tmp5821, 221);
wt332 = _mm512_shuffle_f32x4(tmp5815, tmp5823, 136);
wt340 = _mm512_shuffle_f32x4(tmp5815, tmp5823, 221);
wt333 = _mm512_shuffle_f32x4(tmp5817, tmp5825, 136);
wt341 = _mm512_shuffle_f32x4(tmp5817, tmp5825, 221);
wt334 = _mm512_shuffle_f32x4(tmp5819, tmp5827, 136);
wt342 = _mm512_shuffle_f32x4(tmp5819, tmp5827, 221);
wt335 = _mm512_shuffle_f32x4(tmp5814, tmp5822, 136);
wt343 = _mm512_shuffle_f32x4(tmp5814, tmp5822, 221);
wt336 = _mm512_shuffle_f32x4(tmp5816, tmp5824, 136);
wt344 = _mm512_shuffle_f32x4(tmp5816, tmp5824, 221);
wt337 = _mm512_shuffle_f32x4(tmp5818, tmp5826, 136);
wt345 = _mm512_shuffle_f32x4(tmp5818, tmp5826, 221);
wt338 = _mm512_shuffle_f32x4(tmp5820, tmp5828, 136);
wt346 = _mm512_shuffle_f32x4(tmp5820, tmp5828, 221);
wt331 = _mm512_mul_ps(wt331, postMul21);
wt332 = _mm512_mul_ps(wt332, postMul21);
wt333 = _mm512_mul_ps(wt333, postMul21);
wt334 = _mm512_mul_ps(wt334, postMul21);
wt335 = _mm512_mul_ps(wt335, postMul21);
wt336 = _mm512_mul_ps(wt336, postMul21);
wt337 = _mm512_mul_ps(wt337, postMul21);
wt338 = _mm512_mul_ps(wt338, postMul21);
wt339 = _mm512_mul_ps(wt339, postMul21);
wt340 = _mm512_mul_ps(wt340, postMul21);
wt341 = _mm512_mul_ps(wt341, postMul21);
wt342 = _mm512_mul_ps(wt342, postMul21);
wt343 = _mm512_mul_ps(wt343, postMul21);
wt344 = _mm512_mul_ps(wt344, postMul21);
wt345 = _mm512_mul_ps(wt345, postMul21);
wt346 = _mm512_mul_ps(wt346, postMul21);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c28)+(ptrdiff_t)0, 63>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c28)+(ptrdiff_t)0, 63>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c28)+(ptrdiff_t)0, 63>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c28)+(ptrdiff_t)0, 63>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c28)+(ptrdiff_t)0, 63>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c28)+(ptrdiff_t)0, 63>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c28)+(ptrdiff_t)0, 63>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c28)+(ptrdiff_t)0, 63>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c28)+(ptrdiff_t)0, 63>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c28)+(ptrdiff_t)0, 63>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c28)+(ptrdiff_t)0, 63>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c28)+(ptrdiff_t)0, 63>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c28)+(ptrdiff_t)0, 63>>cut13, wt343);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c28)+(ptrdiff_t)0, 63>>cut13, wt344);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c28)+(ptrdiff_t)0, 63>>cut13, wt345);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c28)+(ptrdiff_t)0, 63>>cut13, wt346);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt343);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt344);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt345);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c28)+(ptrdiff_t)5376, 4032>>cut13, wt346);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt343);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt344);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt345);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c28)+(ptrdiff_t)10752, 258048>>cut13, wt346);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(1+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt331);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(2+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt332);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(3+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt333);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(4+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt334);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(5+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt335);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(6+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt336);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(7+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt337);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(8+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt338);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(9+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt339);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(10+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt340);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(11+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt341);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(12+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt342);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(13+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt343);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(14+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt344);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(15+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt345);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l31+4*cut13+24*(16+16*c28)+(ptrdiff_t)16128, 65535-(262143>>cut13), wt346);
}
}
}
} else {
ptrdiff_t k86 = 112;
ptrdiff_t l30 = (size_t)(0+k86)/6;
ptrdiff_t cut12 = (size_t)(0+k86)%6;
__m512 sum217 = _mm512_maskz_loadu_ps(65535, biasPtr8+512*i35+4*k86);
__m512i pmMul19 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd19 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo18 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k86+128*i35));
__m512 masHi18 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k86+128*i35)+(ptrdiff_t)64);
__m512 postMul19 = _mm512_permutex2var_ps(masLo18, pmMul19, masHi18);
__m512 postAdd17 = _mm512_permutex2var_ps(masLo18, pmAdd19, masHi18);
sum217 = _mm512_fmadd_ps(sum217, postMul19, postAdd17);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum217);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*0+(ptrdiff_t)5376, 4032>>cut12, sum217);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*0+(ptrdiff_t)10752, 258048>>cut12, sum217);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*0+(ptrdiff_t)16128, 65535-(262143>>cut12), sum217);
ptrdiff_t c26 = 0;
for (; c26 != 14; ++c26) {
__m512 wt299 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)0);
__m512 wt300 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)896);
__m512 wt301 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)1792);
__m512 wt302 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)2688);
__m512 wt303 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)3584);
__m512 wt304 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)4480);
__m512 wt305 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)5376);
__m512 wt306 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)6272);
__m512 wt307 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)7168);
__m512 wt308 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)8064);
__m512 wt309 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)8960);
__m512 wt310 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)9856);
__m512 wt311 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)10752);
__m512 wt312 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)11648);
__m512 wt313 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)12544);
__m512 wt314 = _mm512_maskz_loadu_ps(65535, wtPtr8+114688*i35+896*k86+64*c26+(ptrdiff_t)13440);
__m512 tmp5829 = _mm512_unpacklo_ps(wt299, wt300);
__m512 tmp5830 = _mm512_unpackhi_ps(wt299, wt300);
__m512 tmp5831 = _mm512_unpacklo_ps(wt301, wt302);
__m512 tmp5832 = _mm512_unpackhi_ps(wt301, wt302);
__m512 tmp5833 = _mm512_unpacklo_ps(wt303, wt304);
__m512 tmp5834 = _mm512_unpackhi_ps(wt303, wt304);
__m512 tmp5835 = _mm512_unpacklo_ps(wt305, wt306);
__m512 tmp5836 = _mm512_unpackhi_ps(wt305, wt306);
__m512 tmp5837 = _mm512_unpacklo_ps(wt307, wt308);
__m512 tmp5838 = _mm512_unpackhi_ps(wt307, wt308);
__m512 tmp5839 = _mm512_unpacklo_ps(wt309, wt310);
__m512 tmp5840 = _mm512_unpackhi_ps(wt309, wt310);
__m512 tmp5841 = _mm512_unpacklo_ps(wt311, wt312);
__m512 tmp5842 = _mm512_unpackhi_ps(wt311, wt312);
__m512 tmp5843 = _mm512_unpacklo_ps(wt313, wt314);
__m512 tmp5844 = _mm512_unpackhi_ps(wt313, wt314);
__m512 tmp5845 = _mm512_shuffle_ps(tmp5829, tmp5831, 68);
__m512 tmp5846 = _mm512_shuffle_ps(tmp5829, tmp5831, 238);
__m512 tmp5847 = _mm512_shuffle_ps(tmp5830, tmp5832, 68);
__m512 tmp5848 = _mm512_shuffle_ps(tmp5830, tmp5832, 238);
__m512 tmp5849 = _mm512_shuffle_ps(tmp5833, tmp5835, 68);
__m512 tmp5850 = _mm512_shuffle_ps(tmp5833, tmp5835, 238);
__m512 tmp5851 = _mm512_shuffle_ps(tmp5834, tmp5836, 68);
__m512 tmp5852 = _mm512_shuffle_ps(tmp5834, tmp5836, 238);
__m512 tmp5853 = _mm512_shuffle_ps(tmp5837, tmp5839, 68);
__m512 tmp5854 = _mm512_shuffle_ps(tmp5837, tmp5839, 238);
__m512 tmp5855 = _mm512_shuffle_ps(tmp5838, tmp5840, 68);
__m512 tmp5856 = _mm512_shuffle_ps(tmp5838, tmp5840, 238);
__m512 tmp5857 = _mm512_shuffle_ps(tmp5841, tmp5843, 68);
__m512 tmp5858 = _mm512_shuffle_ps(tmp5841, tmp5843, 238);
__m512 tmp5859 = _mm512_shuffle_ps(tmp5842, tmp5844, 68);
__m512 tmp5860 = _mm512_shuffle_ps(tmp5842, tmp5844, 238);
__m512 tmp5861 = _mm512_shuffle_f32x4(tmp5845, tmp5849, 136);
__m512 tmp5862 = _mm512_shuffle_f32x4(tmp5845, tmp5849, 221);
__m512 tmp5863 = _mm512_shuffle_f32x4(tmp5846, tmp5850, 136);
__m512 tmp5864 = _mm512_shuffle_f32x4(tmp5846, tmp5850, 221);
__m512 tmp5865 = _mm512_shuffle_f32x4(tmp5847, tmp5851, 136);
__m512 tmp5866 = _mm512_shuffle_f32x4(tmp5847, tmp5851, 221);
__m512 tmp5867 = _mm512_shuffle_f32x4(tmp5848, tmp5852, 136);
__m512 tmp5868 = _mm512_shuffle_f32x4(tmp5848, tmp5852, 221);
__m512 tmp5869 = _mm512_shuffle_f32x4(tmp5853, tmp5857, 136);
__m512 tmp5870 = _mm512_shuffle_f32x4(tmp5853, tmp5857, 221);
__m512 tmp5871 = _mm512_shuffle_f32x4(tmp5854, tmp5858, 136);
__m512 tmp5872 = _mm512_shuffle_f32x4(tmp5854, tmp5858, 221);
__m512 tmp5873 = _mm512_shuffle_f32x4(tmp5855, tmp5859, 136);
__m512 tmp5874 = _mm512_shuffle_f32x4(tmp5855, tmp5859, 221);
__m512 tmp5875 = _mm512_shuffle_f32x4(tmp5856, tmp5860, 136);
__m512 tmp5876 = _mm512_shuffle_f32x4(tmp5856, tmp5860, 221);
wt299 = _mm512_shuffle_f32x4(tmp5861, tmp5869, 136);
wt307 = _mm512_shuffle_f32x4(tmp5861, tmp5869, 221);
wt300 = _mm512_shuffle_f32x4(tmp5863, tmp5871, 136);
wt308 = _mm512_shuffle_f32x4(tmp5863, tmp5871, 221);
wt301 = _mm512_shuffle_f32x4(tmp5865, tmp5873, 136);
wt309 = _mm512_shuffle_f32x4(tmp5865, tmp5873, 221);
wt302 = _mm512_shuffle_f32x4(tmp5867, tmp5875, 136);
wt310 = _mm512_shuffle_f32x4(tmp5867, tmp5875, 221);
wt303 = _mm512_shuffle_f32x4(tmp5862, tmp5870, 136);
wt311 = _mm512_shuffle_f32x4(tmp5862, tmp5870, 221);
wt304 = _mm512_shuffle_f32x4(tmp5864, tmp5872, 136);
wt312 = _mm512_shuffle_f32x4(tmp5864, tmp5872, 221);
wt305 = _mm512_shuffle_f32x4(tmp5866, tmp5874, 136);
wt313 = _mm512_shuffle_f32x4(tmp5866, tmp5874, 221);
wt306 = _mm512_shuffle_f32x4(tmp5868, tmp5876, 136);
wt314 = _mm512_shuffle_f32x4(tmp5868, tmp5876, 221);
wt299 = _mm512_mul_ps(wt299, postMul19);
wt300 = _mm512_mul_ps(wt300, postMul19);
wt301 = _mm512_mul_ps(wt301, postMul19);
wt302 = _mm512_mul_ps(wt302, postMul19);
wt303 = _mm512_mul_ps(wt303, postMul19);
wt304 = _mm512_mul_ps(wt304, postMul19);
wt305 = _mm512_mul_ps(wt305, postMul19);
wt306 = _mm512_mul_ps(wt306, postMul19);
wt307 = _mm512_mul_ps(wt307, postMul19);
wt308 = _mm512_mul_ps(wt308, postMul19);
wt309 = _mm512_mul_ps(wt309, postMul19);
wt310 = _mm512_mul_ps(wt310, postMul19);
wt311 = _mm512_mul_ps(wt311, postMul19);
wt312 = _mm512_mul_ps(wt312, postMul19);
wt313 = _mm512_mul_ps(wt313, postMul19);
wt314 = _mm512_mul_ps(wt314, postMul19);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(1+16*c26)+(ptrdiff_t)0, 63>>cut12, wt299);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(2+16*c26)+(ptrdiff_t)0, 63>>cut12, wt300);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(3+16*c26)+(ptrdiff_t)0, 63>>cut12, wt301);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(4+16*c26)+(ptrdiff_t)0, 63>>cut12, wt302);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(5+16*c26)+(ptrdiff_t)0, 63>>cut12, wt303);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(6+16*c26)+(ptrdiff_t)0, 63>>cut12, wt304);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(7+16*c26)+(ptrdiff_t)0, 63>>cut12, wt305);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(8+16*c26)+(ptrdiff_t)0, 63>>cut12, wt306);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(9+16*c26)+(ptrdiff_t)0, 63>>cut12, wt307);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(10+16*c26)+(ptrdiff_t)0, 63>>cut12, wt308);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(11+16*c26)+(ptrdiff_t)0, 63>>cut12, wt309);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(12+16*c26)+(ptrdiff_t)0, 63>>cut12, wt310);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(13+16*c26)+(ptrdiff_t)0, 63>>cut12, wt311);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(14+16*c26)+(ptrdiff_t)0, 63>>cut12, wt312);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(15+16*c26)+(ptrdiff_t)0, 63>>cut12, wt313);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(16+16*c26)+(ptrdiff_t)0, 63>>cut12, wt314);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(1+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt299);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(2+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt300);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(3+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt301);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(4+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt302);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(5+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt303);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(6+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt304);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(7+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt305);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(8+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt306);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(9+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt307);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(10+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt308);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(11+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt309);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(12+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt310);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(13+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt311);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(14+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt312);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(15+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt313);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(16+16*c26)+(ptrdiff_t)5376, 4032>>cut12, wt314);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(1+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt299);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(2+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt300);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(3+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt301);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(4+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt302);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(5+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt303);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(6+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt304);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(7+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt305);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(8+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt306);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(9+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt307);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(10+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt308);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(11+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt309);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(12+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt310);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(13+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt311);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(14+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt312);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(15+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt313);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+24*(16+16*c26)+(ptrdiff_t)10752, 258048>>cut12, wt314);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(1+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt299);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(2+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt300);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(3+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt301);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(4+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt302);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(5+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt303);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(6+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt304);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(7+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt305);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(8+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt306);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(9+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt307);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(10+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt308);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(11+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt309);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(12+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt310);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(13+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt311);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(14+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt312);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(15+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt313);
_mm512_mask_storeu_ps(arranged11+115200*i35+5400*l30+4*cut12+8*(16+16*c26)+(ptrdiff_t)16128, 65535-(262143>>cut12), wt314);
}
}
}
}
}

// Builds a threader task that runs DenseNet121OneArrangeWts6Callee1 over a
// 3-dimensional hull with extents {2, 1, 1} and submits it to the team.
//   team38    - threader team handed through to DenseNet121ThreaderDo1.
//   tensors49 - tensor pointer table, passed to the callee via task.any1.
// NOTE(review): how DenseNet121ThreaderDo1 distributes hull points across
// threads is defined elsewhere in the file - confirm there.
static void DenseNet121OneArrangeWts6(DenseNet121ThreaderTeam1* team38, char** tensors49) {
	static const int extents[3] = {2, 1, 1};
	DenseNet121ThreaderTask1 job;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors49;
	job.callee1 = DenseNet121OneArrangeWts6Callee1;
	DenseNet121ThreaderDo1(team38, &job);
}

// Task callee for DenseNet121OneArrangeDats6: repacks one 256-byte-wide tile
// of the source tensor into the arranged layout, applying a per-row
// scale/shift followed by max(0, x) on the way.
//   task54 - task descriptor; task54->any1 is the tensor pointer table:
//            [0] source data, [1] interleaved (mul, add) float pairs,
//            [2] arranged destination.
//   pt32   - task coordinates; only pt32[1] (the tile index) is read.
// For each of the 224 source rows (12608 bytes apart) the tile's four
// 16-float vectors are transformed and written contiguously, so the
// destination holds 224 * 256 = 57344 bytes per tile.
// NOTE(review): the (mul, add) pairs are presumably folded BatchNorm
// parameters and the max is the ReLU that follows it - confirm against the
// code that fills tensors[1].
static void DenseNet121OneArrangeDats6Callee1(DenseNet121ThreaderTask1* task54, int64_t* pt32) {
	char** tensors52 = task54->any1;
	const ptrdiff_t tile = pt32[1];
	// Tile index 49 is one past the last full tile; nothing is done for it.
	if (tile == 49) return;
	const __mmask16 allLanes = 65535;
	const __m512 zero = _mm512_setzero_ps();
	char*restrict src = tensors52[0]+256*tile;
	char*restrict bn = tensors52[1];
	char*restrict dst = tensors52[2]+57344*tile;
	for (ptrdiff_t ch = 0; ch < 224; ++ch) {
		// Two floats per row: multiplier first, then addend.
		const float* pair = (float*)bn+(ptrdiff_t)2*ch;
		const __m512 scale = _mm512_set1_ps(pair[0]);
		const __m512 shift = _mm512_set1_ps(pair[1]);
		for (ptrdiff_t off = 0; off < 256; off += 64) {
			__m512 v = _mm512_maskz_loadu_ps(allLanes, src+12608*ch+off);
			v = _mm512_fmadd_ps(v, scale, shift);
			// Zero is kept as the first operand, as in the generated code.
			v = _mm512_max_ps(zero, v);
			_mm512_mask_storeu_ps(dst+256*ch+off, allLanes, v);
		}
	}
}

// Builds a threader task that runs DenseNet121OneArrangeDats6Callee1 over a
// 4-dimensional hull with extents {1, 49, 1, 1} and submits it to the team.
// The callee reads coordinate 1, so the 49 hull points along that axis map
// to its tile indices 0..48.
//   team39    - threader team handed through to DenseNet121ThreaderDo1.
//   tensors51 - tensor pointer table, passed to the callee via task.any1.
static void DenseNet121OneArrangeDats6(DenseNet121ThreaderTeam1* team39, char** tensors51) {
	static const int extents[4] = {1, 49, 1, 1};
	DenseNet121ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors51;
	job.callee1 = DenseNet121OneArrangeDats6Callee1;
	DenseNet121ThreaderDo1(team39, &job);
}

// Task callee for DenseNet121OneApply6: for one data tile and one run of
// weight groups, accumulates weight * data products over 224 steps, clamps
// the sums at zero from below, and stores them to the output tensor.
//   task56 - task descriptor; task56->any1 is a void*[] whose element 0 is
//            the tensor pointer table: [0] arranged weights, [1] arranged
//            data, [2] output.
//   pt33   - task coordinates: pt33[0] (w38) selects the starting weight
//            group 3*w38, pt33[1] (d10) selects the data tile.
// Control flow: for w38 < 6 the k89 loop handles groups 3*w38 .. 3*w38+2 and
// returns from inside the loop. For w38 == 6 it handles groups 18..20, leaves
// the loop at k89 == 21, runs the 2-wide tail below, then returns.
// NOTE(review): each 12608-byte output row is presumably one output channel
// (21 groups of 6 plus a tail of 2) - confirm against the layer shape.
static void DenseNet121OneApply6Callee1(DenseNet121ThreaderTask1* task56, int64_t* pt33) {
void** pair14 = task56->any1;
char** tensors54 = pair14[0];
// e14 and g15 are fixed at 0 here, so the base-pointer offsets below vanish.
ptrdiff_t e14 = 0;
ptrdiff_t g15 = 0;
ptrdiff_t d10 = pt33[1];
ptrdiff_t w38 = pt33[0];
char*restrict arrangedWts6 = tensors54[0]+428032*e14+(ptrdiff_t)115200*1*g15;
char*restrict arrangedDats6 = tensors54[1]+10474240*e14+(ptrdiff_t)2809856*1*g15;
char*restrict datPtr16 = tensors54[2]+(ptrdiff_t)1613824*1*g15;
ptrdiff_t ii18 = 1;
for (ptrdiff_t i37 = 0; i37 < ii18; ++i37) {
ptrdiff_t j28 = 1*d10;
// jj37 == j28, so the check at the bottom of the j28 loop returns after one tile.
ptrdiff_t jj37 = j28+0;
for (; j28 != 49; ++j28) {
ptrdiff_t k89 = 3*w38;
// kk36 is the last full group this task handles (inclusive); for w38 == 6
// it equals 21, so the loop ends via k89 != 21 and falls into the tail.
ptrdiff_t kk36 = k89+(w38 < 6 ? 2 : 3);
for (; k89 != 21; ++k89) {
// Full group: 6 weight columns x 4 data vectors = 24 accumulators.
// With s22 == -1 the loads below read byte offsets 0..20 of the group,
// i.e. the slot before the first weight step.
// NOTE(review): presumably the bias written by the arrange step - confirm.
ptrdiff_t s22 = -1;
__m512 sum220 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)24));
__m512 sum224 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)28));
__m512 sum228 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)32));
__m512 sum232 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)36));
__m512 sum236 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)40));
__m512 sum240 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)44));
// The other three accumulators of each row start from the same value.
__m512 sum221 = sum220;
__m512 sum222 = sum220;
__m512 sum223 = sum220;
__m512 sum225 = sum224;
__m512 sum226 = sum224;
__m512 sum227 = sum224;
__m512 sum229 = sum228;
__m512 sum230 = sum228;
__m512 sum231 = sum228;
__m512 sum233 = sum232;
__m512 sum234 = sum232;
__m512 sum235 = sum232;
__m512 sum237 = sum236;
__m512 sum238 = sum236;
__m512 sum239 = sum236;
__m512 sum241 = sum240;
__m512 sum242 = sum240;
__m512 sum243 = sum240;
// Reduction over 224 steps: each step loads four 64-byte data vectors
// (256 bytes) and six broadcast weights (24 bytes), then does 24 FMAs.
for (s22 = 0; s22 < 224; ++s22) {
__m512 dat1323 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s22+(ptrdiff_t)0);
__m512 dat1324 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s22+(ptrdiff_t)64);
__m512 dat1325 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s22+(ptrdiff_t)128);
__m512 dat1326 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s22+(ptrdiff_t)192);
__m512 wt347 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)24));
sum220 = _mm512_fmadd_ps(wt347, dat1323, sum220);
sum221 = _mm512_fmadd_ps(wt347, dat1324, sum221);
sum222 = _mm512_fmadd_ps(wt347, dat1325, sum222);
sum223 = _mm512_fmadd_ps(wt347, dat1326, sum223);
__m512 wt348 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)28));
sum224 = _mm512_fmadd_ps(wt348, dat1323, sum224);
sum225 = _mm512_fmadd_ps(wt348, dat1324, sum225);
sum226 = _mm512_fmadd_ps(wt348, dat1325, sum226);
sum227 = _mm512_fmadd_ps(wt348, dat1326, sum227);
__m512 wt349 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)32));
sum228 = _mm512_fmadd_ps(wt349, dat1323, sum228);
sum229 = _mm512_fmadd_ps(wt349, dat1324, sum229);
sum230 = _mm512_fmadd_ps(wt349, dat1325, sum230);
sum231 = _mm512_fmadd_ps(wt349, dat1326, sum231);
__m512 wt350 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)36));
sum232 = _mm512_fmadd_ps(wt350, dat1323, sum232);
sum233 = _mm512_fmadd_ps(wt350, dat1324, sum233);
sum234 = _mm512_fmadd_ps(wt350, dat1325, sum234);
sum235 = _mm512_fmadd_ps(wt350, dat1326, sum235);
__m512 wt351 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)40));
sum236 = _mm512_fmadd_ps(wt351, dat1323, sum236);
sum237 = _mm512_fmadd_ps(wt351, dat1324, sum237);
sum238 = _mm512_fmadd_ps(wt351, dat1325, sum238);
sum239 = _mm512_fmadd_ps(wt351, dat1326, sum239);
__m512 wt352 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+24*s22+(ptrdiff_t)44));
sum240 = _mm512_fmadd_ps(wt352, dat1323, sum240);
sum241 = _mm512_fmadd_ps(wt352, dat1324, sum241);
sum242 = _mm512_fmadd_ps(wt352, dat1325, sum242);
sum243 = _mm512_fmadd_ps(wt352, dat1326, sum243);
}
// Clamp at zero and store. The six rows of this group land 12608 bytes
// apart (75648 = 6 * 12608 per group); 256*j28 selects the tile in a row.
sum220 = _mm512_max_ps(_mm512_setzero_ps(), sum220);
sum221 = _mm512_max_ps(_mm512_setzero_ps(), sum221);
sum222 = _mm512_max_ps(_mm512_setzero_ps(), sum222);
sum223 = _mm512_max_ps(_mm512_setzero_ps(), sum223);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)0, 65535, sum220);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)64, 65535, sum221);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)128, 65535, sum222);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)192, 65535, sum223);
sum224 = _mm512_max_ps(_mm512_setzero_ps(), sum224);
sum225 = _mm512_max_ps(_mm512_setzero_ps(), sum225);
sum226 = _mm512_max_ps(_mm512_setzero_ps(), sum226);
sum227 = _mm512_max_ps(_mm512_setzero_ps(), sum227);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12608, 65535, sum224);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12672, 65535, sum225);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12736, 65535, sum226);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12800, 65535, sum227);
sum228 = _mm512_max_ps(_mm512_setzero_ps(), sum228);
sum229 = _mm512_max_ps(_mm512_setzero_ps(), sum229);
sum230 = _mm512_max_ps(_mm512_setzero_ps(), sum230);
sum231 = _mm512_max_ps(_mm512_setzero_ps(), sum231);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)25216, 65535, sum228);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)25280, 65535, sum229);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)25344, 65535, sum230);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)25408, 65535, sum231);
sum232 = _mm512_max_ps(_mm512_setzero_ps(), sum232);
sum233 = _mm512_max_ps(_mm512_setzero_ps(), sum233);
sum234 = _mm512_max_ps(_mm512_setzero_ps(), sum234);
sum235 = _mm512_max_ps(_mm512_setzero_ps(), sum235);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)37824, 65535, sum232);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)37888, 65535, sum233);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)37952, 65535, sum234);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)38016, 65535, sum235);
sum236 = _mm512_max_ps(_mm512_setzero_ps(), sum236);
sum237 = _mm512_max_ps(_mm512_setzero_ps(), sum237);
sum238 = _mm512_max_ps(_mm512_setzero_ps(), sum238);
sum239 = _mm512_max_ps(_mm512_setzero_ps(), sum239);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)50432, 65535, sum236);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)50496, 65535, sum237);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)50560, 65535, sum238);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)50624, 65535, sum239);
sum240 = _mm512_max_ps(_mm512_setzero_ps(), sum240);
sum241 = _mm512_max_ps(_mm512_setzero_ps(), sum241);
sum242 = _mm512_max_ps(_mm512_setzero_ps(), sum242);
sum243 = _mm512_max_ps(_mm512_setzero_ps(), sum243);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)63040, 65535, sum240);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)63104, 65535, sum241);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)63168, 65535, sum242);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)63232, 65535, sum243);
// This task's share of full groups is done; only w38 == 6 falls through.
if (k89 >= kk36) return;
}
// Tail group (k89 == 21): only 2 weight columns, packed 8 bytes per step
// instead of 24, giving 2 x 4 accumulators. Same start-slot convention.
ptrdiff_t s23 = -1;
__m512 sum244 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+8*s23+(ptrdiff_t)8));
__m512 sum248 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+8*s23+(ptrdiff_t)12));
__m512 sum245 = sum244;
__m512 sum246 = sum244;
__m512 sum247 = sum244;
__m512 sum249 = sum248;
__m512 sum250 = sum248;
__m512 sum251 = sum248;
for (s23 = 0; s23 < 224; ++s23) {
__m512 dat1327 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s23+(ptrdiff_t)0);
__m512 dat1328 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s23+(ptrdiff_t)64);
__m512 dat1329 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s23+(ptrdiff_t)128);
__m512 dat1330 = _mm512_loadu_ps(arrangedDats6+2809856*i37+57344*j28+256*s23+(ptrdiff_t)192);
__m512 wt353 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+8*s23+(ptrdiff_t)8));
sum244 = _mm512_fmadd_ps(wt353, dat1327, sum244);
sum245 = _mm512_fmadd_ps(wt353, dat1328, sum245);
sum246 = _mm512_fmadd_ps(wt353, dat1329, sum246);
sum247 = _mm512_fmadd_ps(wt353, dat1330, sum247);
__m512 wt354 = _mm512_set1_ps(*(float*)(arrangedWts6+115200*i37+5400*k89+8*s23+(ptrdiff_t)12));
sum248 = _mm512_fmadd_ps(wt354, dat1327, sum248);
sum249 = _mm512_fmadd_ps(wt354, dat1328, sum249);
sum250 = _mm512_fmadd_ps(wt354, dat1329, sum250);
sum251 = _mm512_fmadd_ps(wt354, dat1330, sum251);
}
// Clamp at zero and store the two tail rows.
sum244 = _mm512_max_ps(_mm512_setzero_ps(), sum244);
sum245 = _mm512_max_ps(_mm512_setzero_ps(), sum245);
sum246 = _mm512_max_ps(_mm512_setzero_ps(), sum246);
sum247 = _mm512_max_ps(_mm512_setzero_ps(), sum247);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)0, 65535, sum244);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)64, 65535, sum245);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)128, 65535, sum246);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)192, 65535, sum247);
sum248 = _mm512_max_ps(_mm512_setzero_ps(), sum248);
sum249 = _mm512_max_ps(_mm512_setzero_ps(), sum249);
sum250 = _mm512_max_ps(_mm512_setzero_ps(), sum250);
sum251 = _mm512_max_ps(_mm512_setzero_ps(), sum251);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12608, 65535, sum248);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12672, 65535, sum249);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12736, 65535, sum250);
_mm512_mask_storeu_ps(datPtr16+1613824*i37+256*j28+75648*k89+(ptrdiff_t)12800, 65535, sum251);
// Always true on the first pass (jj37 == starting j28): one tile per task.
if (j28 >= jj37) return;
}
}
}

// Builds a threader task that runs DenseNet121OneApply6Callee1 over a
// 3-dimensional hull with extents {7, 49, 1} and submits it to the team.
// The callee reads coordinate 0 as its weight-group selector and
// coordinate 1 as its data-tile index.
//   team40    - threader team handed through to DenseNet121ThreaderDo1.
//   tensors53 - tensor pointer table; wrapped in a two-slot void* array
//               whose second slot is null (the callee reads slot 0 only).
static void DenseNet121OneApply6(DenseNet121ThreaderTeam1* team40, char** tensors53) {
	static const int extents[3] = {7, 49, 1};
	void* args[2];
	args[0] = tensors53;
	args[1] = 0;
	DenseNet121ThreaderTask1 job;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = args;
	job.callee1 = DenseNet121OneApply6Callee1;
	DenseNet121ThreaderDo1(team40, &job);
}

static void DenseNet121OneArrangeWts7Callee1(DenseNet121ThreaderTask1* task58, int64_t* pt34) {
char** tensors56 = task58->any1;
ptrdiff_t b51 = pt34[0];
char*restrict wtPtr9 = tensors56[0]+(ptrdiff_t)3340*0+(ptrdiff_t)131072*0;
char*restrict biasPtr9 = tensors56[1]+(ptrdiff_t)512*0;
char*restrict arranged13 = tensors56[2]+(ptrdiff_t)428032*0+(ptrdiff_t)131584*0;
ptrdiff_t ii19 = 1;
for (ptrdiff_t i39 = 0; i39 < ii19; ++i39) {
ptrdiff_t j29 = 2*b51;
ptrdiff_t jj38 = j29+2;
for (; j29 < jj38; ++j29) {
if (j29 < 7) {
ptrdiff_t k91 = 0+16*(j29-0);
ptrdiff_t l33 = (size_t)(0+k91)/6;
ptrdiff_t cut15 = (size_t)(0+k91)%6;
switch (cut15) {
case 0:;
case 2: {
__m512 sum253 = _mm512_maskz_loadu_ps(65535, biasPtr9+512*i39+4*k91);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum253);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)6144, 4032>>cut15, sum253);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)12288, 65535-(4095>>cut15), sum253);
ptrdiff_t c31 = 0;
for (; c31 != 16; ++c31) {
__m512 wt371 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)0);
__m512 wt372 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)1024);
__m512 wt373 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)2048);
__m512 wt374 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)3072);
__m512 wt375 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)4096);
__m512 wt376 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)5120);
__m512 wt377 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)6144);
__m512 wt378 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)7168);
__m512 wt379 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)8192);
__m512 wt380 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)9216);
__m512 wt381 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)10240);
__m512 wt382 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)11264);
__m512 wt383 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)12288);
__m512 wt384 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)13312);
__m512 wt385 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)14336);
__m512 wt386 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c31+(ptrdiff_t)15360);
__m512 tmp5877 = _mm512_unpacklo_ps(wt371, wt372);
__m512 tmp5878 = _mm512_unpackhi_ps(wt371, wt372);
__m512 tmp5879 = _mm512_unpacklo_ps(wt373, wt374);
__m512 tmp5880 = _mm512_unpackhi_ps(wt373, wt374);
__m512 tmp5881 = _mm512_unpacklo_ps(wt375, wt376);
__m512 tmp5882 = _mm512_unpackhi_ps(wt375, wt376);
__m512 tmp5883 = _mm512_unpacklo_ps(wt377, wt378);
__m512 tmp5884 = _mm512_unpackhi_ps(wt377, wt378);
__m512 tmp5885 = _mm512_unpacklo_ps(wt379, wt380);
__m512 tmp5886 = _mm512_unpackhi_ps(wt379, wt380);
__m512 tmp5887 = _mm512_unpacklo_ps(wt381, wt382);
__m512 tmp5888 = _mm512_unpackhi_ps(wt381, wt382);
__m512 tmp5889 = _mm512_unpacklo_ps(wt383, wt384);
__m512 tmp5890 = _mm512_unpackhi_ps(wt383, wt384);
__m512 tmp5891 = _mm512_unpacklo_ps(wt385, wt386);
__m512 tmp5892 = _mm512_unpackhi_ps(wt385, wt386);
__m512 tmp5893 = _mm512_shuffle_ps(tmp5877, tmp5879, 68);
__m512 tmp5894 = _mm512_shuffle_ps(tmp5877, tmp5879, 238);
__m512 tmp5895 = _mm512_shuffle_ps(tmp5878, tmp5880, 68);
__m512 tmp5896 = _mm512_shuffle_ps(tmp5878, tmp5880, 238);
__m512 tmp5897 = _mm512_shuffle_ps(tmp5881, tmp5883, 68);
__m512 tmp5898 = _mm512_shuffle_ps(tmp5881, tmp5883, 238);
__m512 tmp5899 = _mm512_shuffle_ps(tmp5882, tmp5884, 68);
__m512 tmp5900 = _mm512_shuffle_ps(tmp5882, tmp5884, 238);
__m512 tmp5901 = _mm512_shuffle_ps(tmp5885, tmp5887, 68);
__m512 tmp5902 = _mm512_shuffle_ps(tmp5885, tmp5887, 238);
__m512 tmp5903 = _mm512_shuffle_ps(tmp5886, tmp5888, 68);
__m512 tmp5904 = _mm512_shuffle_ps(tmp5886, tmp5888, 238);
__m512 tmp5905 = _mm512_shuffle_ps(tmp5889, tmp5891, 68);
__m512 tmp5906 = _mm512_shuffle_ps(tmp5889, tmp5891, 238);
__m512 tmp5907 = _mm512_shuffle_ps(tmp5890, tmp5892, 68);
__m512 tmp5908 = _mm512_shuffle_ps(tmp5890, tmp5892, 238);
__m512 tmp5909 = _mm512_shuffle_f32x4(tmp5893, tmp5897, 136);
__m512 tmp5910 = _mm512_shuffle_f32x4(tmp5893, tmp5897, 221);
__m512 tmp5911 = _mm512_shuffle_f32x4(tmp5894, tmp5898, 136);
__m512 tmp5912 = _mm512_shuffle_f32x4(tmp5894, tmp5898, 221);
__m512 tmp5913 = _mm512_shuffle_f32x4(tmp5895, tmp5899, 136);
__m512 tmp5914 = _mm512_shuffle_f32x4(tmp5895, tmp5899, 221);
__m512 tmp5915 = _mm512_shuffle_f32x4(tmp5896, tmp5900, 136);
__m512 tmp5916 = _mm512_shuffle_f32x4(tmp5896, tmp5900, 221);
__m512 tmp5917 = _mm512_shuffle_f32x4(tmp5901, tmp5905, 136);
__m512 tmp5918 = _mm512_shuffle_f32x4(tmp5901, tmp5905, 221);
__m512 tmp5919 = _mm512_shuffle_f32x4(tmp5902, tmp5906, 136);
__m512 tmp5920 = _mm512_shuffle_f32x4(tmp5902, tmp5906, 221);
__m512 tmp5921 = _mm512_shuffle_f32x4(tmp5903, tmp5907, 136);
__m512 tmp5922 = _mm512_shuffle_f32x4(tmp5903, tmp5907, 221);
__m512 tmp5923 = _mm512_shuffle_f32x4(tmp5904, tmp5908, 136);
__m512 tmp5924 = _mm512_shuffle_f32x4(tmp5904, tmp5908, 221);
wt371 = _mm512_shuffle_f32x4(tmp5909, tmp5917, 136);
wt379 = _mm512_shuffle_f32x4(tmp5909, tmp5917, 221);
wt372 = _mm512_shuffle_f32x4(tmp5911, tmp5919, 136);
wt380 = _mm512_shuffle_f32x4(tmp5911, tmp5919, 221);
wt373 = _mm512_shuffle_f32x4(tmp5913, tmp5921, 136);
wt381 = _mm512_shuffle_f32x4(tmp5913, tmp5921, 221);
wt374 = _mm512_shuffle_f32x4(tmp5915, tmp5923, 136);
wt382 = _mm512_shuffle_f32x4(tmp5915, tmp5923, 221);
wt375 = _mm512_shuffle_f32x4(tmp5910, tmp5918, 136);
wt383 = _mm512_shuffle_f32x4(tmp5910, tmp5918, 221);
wt376 = _mm512_shuffle_f32x4(tmp5912, tmp5920, 136);
wt384 = _mm512_shuffle_f32x4(tmp5912, tmp5920, 221);
wt377 = _mm512_shuffle_f32x4(tmp5914, tmp5922, 136);
wt385 = _mm512_shuffle_f32x4(tmp5914, tmp5922, 221);
wt378 = _mm512_shuffle_f32x4(tmp5916, tmp5924, 136);
wt386 = _mm512_shuffle_f32x4(tmp5916, tmp5924, 221);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c31)+(ptrdiff_t)0, 63>>cut15, wt371);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c31)+(ptrdiff_t)0, 63>>cut15, wt372);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c31)+(ptrdiff_t)0, 63>>cut15, wt373);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c31)+(ptrdiff_t)0, 63>>cut15, wt374);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c31)+(ptrdiff_t)0, 63>>cut15, wt375);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c31)+(ptrdiff_t)0, 63>>cut15, wt376);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c31)+(ptrdiff_t)0, 63>>cut15, wt377);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c31)+(ptrdiff_t)0, 63>>cut15, wt378);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c31)+(ptrdiff_t)0, 63>>cut15, wt379);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c31)+(ptrdiff_t)0, 63>>cut15, wt380);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c31)+(ptrdiff_t)0, 63>>cut15, wt381);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c31)+(ptrdiff_t)0, 63>>cut15, wt382);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c31)+(ptrdiff_t)0, 63>>cut15, wt383);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c31)+(ptrdiff_t)0, 63>>cut15, wt384);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c31)+(ptrdiff_t)0, 63>>cut15, wt385);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c31)+(ptrdiff_t)0, 63>>cut15, wt386);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt371);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt372);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt373);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt374);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt375);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt376);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt377);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt378);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt379);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt380);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt381);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt382);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt383);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt384);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt385);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c31)+(ptrdiff_t)6144, 4032>>cut15, wt386);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt371);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt372);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt373);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt374);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt375);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt376);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt377);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt378);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt379);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt380);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt381);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt382);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt383);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt384);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt385);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c31)+(ptrdiff_t)12288, 65535-(4095>>cut15), wt386);
}
break;
}
default: {
cut15 = 4;
__m512 sum254 = _mm512_maskz_loadu_ps(65535, biasPtr9+512*i39+4*k91);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum254);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)6144, 4032>>cut15, sum254);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)12288, 258048>>cut15, sum254);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*0+(ptrdiff_t)18432, 65535-(262143>>cut15), sum254);
ptrdiff_t c32 = 0;
for (; c32 != 16; ++c32) {
__m512 wt387 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)0);
__m512 wt388 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)1024);
__m512 wt389 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)2048);
__m512 wt390 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)3072);
__m512 wt391 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)4096);
__m512 wt392 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)5120);
__m512 wt393 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)6144);
__m512 wt394 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)7168);
__m512 wt395 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)8192);
__m512 wt396 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)9216);
__m512 wt397 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)10240);
__m512 wt398 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)11264);
__m512 wt399 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)12288);
__m512 wt400 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)13312);
__m512 wt401 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)14336);
__m512 wt402 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k91+64*c32+(ptrdiff_t)15360);
__m512 tmp5925 = _mm512_unpacklo_ps(wt387, wt388);
__m512 tmp5926 = _mm512_unpackhi_ps(wt387, wt388);
__m512 tmp5927 = _mm512_unpacklo_ps(wt389, wt390);
__m512 tmp5928 = _mm512_unpackhi_ps(wt389, wt390);
__m512 tmp5929 = _mm512_unpacklo_ps(wt391, wt392);
__m512 tmp5930 = _mm512_unpackhi_ps(wt391, wt392);
__m512 tmp5931 = _mm512_unpacklo_ps(wt393, wt394);
__m512 tmp5932 = _mm512_unpackhi_ps(wt393, wt394);
__m512 tmp5933 = _mm512_unpacklo_ps(wt395, wt396);
__m512 tmp5934 = _mm512_unpackhi_ps(wt395, wt396);
__m512 tmp5935 = _mm512_unpacklo_ps(wt397, wt398);
__m512 tmp5936 = _mm512_unpackhi_ps(wt397, wt398);
__m512 tmp5937 = _mm512_unpacklo_ps(wt399, wt400);
__m512 tmp5938 = _mm512_unpackhi_ps(wt399, wt400);
__m512 tmp5939 = _mm512_unpacklo_ps(wt401, wt402);
__m512 tmp5940 = _mm512_unpackhi_ps(wt401, wt402);
__m512 tmp5941 = _mm512_shuffle_ps(tmp5925, tmp5927, 68);
__m512 tmp5942 = _mm512_shuffle_ps(tmp5925, tmp5927, 238);
__m512 tmp5943 = _mm512_shuffle_ps(tmp5926, tmp5928, 68);
__m512 tmp5944 = _mm512_shuffle_ps(tmp5926, tmp5928, 238);
__m512 tmp5945 = _mm512_shuffle_ps(tmp5929, tmp5931, 68);
__m512 tmp5946 = _mm512_shuffle_ps(tmp5929, tmp5931, 238);
__m512 tmp5947 = _mm512_shuffle_ps(tmp5930, tmp5932, 68);
__m512 tmp5948 = _mm512_shuffle_ps(tmp5930, tmp5932, 238);
__m512 tmp5949 = _mm512_shuffle_ps(tmp5933, tmp5935, 68);
__m512 tmp5950 = _mm512_shuffle_ps(tmp5933, tmp5935, 238);
__m512 tmp5951 = _mm512_shuffle_ps(tmp5934, tmp5936, 68);
__m512 tmp5952 = _mm512_shuffle_ps(tmp5934, tmp5936, 238);
__m512 tmp5953 = _mm512_shuffle_ps(tmp5937, tmp5939, 68);
__m512 tmp5954 = _mm512_shuffle_ps(tmp5937, tmp5939, 238);
__m512 tmp5955 = _mm512_shuffle_ps(tmp5938, tmp5940, 68);
__m512 tmp5956 = _mm512_shuffle_ps(tmp5938, tmp5940, 238);
__m512 tmp5957 = _mm512_shuffle_f32x4(tmp5941, tmp5945, 136);
__m512 tmp5958 = _mm512_shuffle_f32x4(tmp5941, tmp5945, 221);
__m512 tmp5959 = _mm512_shuffle_f32x4(tmp5942, tmp5946, 136);
__m512 tmp5960 = _mm512_shuffle_f32x4(tmp5942, tmp5946, 221);
__m512 tmp5961 = _mm512_shuffle_f32x4(tmp5943, tmp5947, 136);
__m512 tmp5962 = _mm512_shuffle_f32x4(tmp5943, tmp5947, 221);
__m512 tmp5963 = _mm512_shuffle_f32x4(tmp5944, tmp5948, 136);
__m512 tmp5964 = _mm512_shuffle_f32x4(tmp5944, tmp5948, 221);
__m512 tmp5965 = _mm512_shuffle_f32x4(tmp5949, tmp5953, 136);
__m512 tmp5966 = _mm512_shuffle_f32x4(tmp5949, tmp5953, 221);
__m512 tmp5967 = _mm512_shuffle_f32x4(tmp5950, tmp5954, 136);
__m512 tmp5968 = _mm512_shuffle_f32x4(tmp5950, tmp5954, 221);
__m512 tmp5969 = _mm512_shuffle_f32x4(tmp5951, tmp5955, 136);
__m512 tmp5970 = _mm512_shuffle_f32x4(tmp5951, tmp5955, 221);
__m512 tmp5971 = _mm512_shuffle_f32x4(tmp5952, tmp5956, 136);
__m512 tmp5972 = _mm512_shuffle_f32x4(tmp5952, tmp5956, 221);
wt387 = _mm512_shuffle_f32x4(tmp5957, tmp5965, 136);
wt395 = _mm512_shuffle_f32x4(tmp5957, tmp5965, 221);
wt388 = _mm512_shuffle_f32x4(tmp5959, tmp5967, 136);
wt396 = _mm512_shuffle_f32x4(tmp5959, tmp5967, 221);
wt389 = _mm512_shuffle_f32x4(tmp5961, tmp5969, 136);
wt397 = _mm512_shuffle_f32x4(tmp5961, tmp5969, 221);
wt390 = _mm512_shuffle_f32x4(tmp5963, tmp5971, 136);
wt398 = _mm512_shuffle_f32x4(tmp5963, tmp5971, 221);
wt391 = _mm512_shuffle_f32x4(tmp5958, tmp5966, 136);
wt399 = _mm512_shuffle_f32x4(tmp5958, tmp5966, 221);
wt392 = _mm512_shuffle_f32x4(tmp5960, tmp5968, 136);
wt400 = _mm512_shuffle_f32x4(tmp5960, tmp5968, 221);
wt393 = _mm512_shuffle_f32x4(tmp5962, tmp5970, 136);
wt401 = _mm512_shuffle_f32x4(tmp5962, tmp5970, 221);
wt394 = _mm512_shuffle_f32x4(tmp5964, tmp5972, 136);
wt402 = _mm512_shuffle_f32x4(tmp5964, tmp5972, 221);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c32)+(ptrdiff_t)0, 63>>cut15, wt387);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c32)+(ptrdiff_t)0, 63>>cut15, wt388);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c32)+(ptrdiff_t)0, 63>>cut15, wt389);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c32)+(ptrdiff_t)0, 63>>cut15, wt390);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c32)+(ptrdiff_t)0, 63>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c32)+(ptrdiff_t)0, 63>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c32)+(ptrdiff_t)0, 63>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c32)+(ptrdiff_t)0, 63>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c32)+(ptrdiff_t)0, 63>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c32)+(ptrdiff_t)0, 63>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c32)+(ptrdiff_t)0, 63>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c32)+(ptrdiff_t)0, 63>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c32)+(ptrdiff_t)0, 63>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c32)+(ptrdiff_t)0, 63>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c32)+(ptrdiff_t)0, 63>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c32)+(ptrdiff_t)0, 63>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt387);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt388);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt389);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt390);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c32)+(ptrdiff_t)6144, 4032>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt387);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt388);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt389);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt390);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c32)+(ptrdiff_t)12288, 258048>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(1+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt387);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(2+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt388);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(3+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt389);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(4+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt390);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(5+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt391);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(6+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt392);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(7+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt393);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(8+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt394);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(9+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt395);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(10+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt396);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(11+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt397);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(12+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt398);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(13+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt399);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(14+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt400);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(15+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt401);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l33+4*cut15+24*(16+16*c32)+(ptrdiff_t)18432, 65535-(262143>>cut15), wt402);
}
}
}
} else {
ptrdiff_t k90 = 112;
ptrdiff_t l32 = (size_t)(0+k90)/6;
ptrdiff_t cut14 = (size_t)(0+k90)%6;
__m512 sum252 = _mm512_maskz_loadu_ps(65535, biasPtr9+512*i39+4*k90);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum252);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*0+(ptrdiff_t)6144, 4032>>cut14, sum252);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*0+(ptrdiff_t)12288, 258048>>cut14, sum252);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*0+(ptrdiff_t)18432, 65535-(262143>>cut14), sum252);
ptrdiff_t c30 = 0;
for (; c30 != 16; ++c30) {
__m512 wt355 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)0);
__m512 wt356 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)1024);
__m512 wt357 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)2048);
__m512 wt358 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)3072);
__m512 wt359 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)4096);
__m512 wt360 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)5120);
__m512 wt361 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)6144);
__m512 wt362 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)7168);
__m512 wt363 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)8192);
__m512 wt364 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)9216);
__m512 wt365 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)10240);
__m512 wt366 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)11264);
__m512 wt367 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)12288);
__m512 wt368 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)13312);
__m512 wt369 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)14336);
__m512 wt370 = _mm512_maskz_loadu_ps(65535, wtPtr9+131072*i39+1024*k90+64*c30+(ptrdiff_t)15360);
__m512 tmp5973 = _mm512_unpacklo_ps(wt355, wt356);
__m512 tmp5974 = _mm512_unpackhi_ps(wt355, wt356);
__m512 tmp5975 = _mm512_unpacklo_ps(wt357, wt358);
__m512 tmp5976 = _mm512_unpackhi_ps(wt357, wt358);
__m512 tmp5977 = _mm512_unpacklo_ps(wt359, wt360);
__m512 tmp5978 = _mm512_unpackhi_ps(wt359, wt360);
__m512 tmp5979 = _mm512_unpacklo_ps(wt361, wt362);
__m512 tmp5980 = _mm512_unpackhi_ps(wt361, wt362);
__m512 tmp5981 = _mm512_unpacklo_ps(wt363, wt364);
__m512 tmp5982 = _mm512_unpackhi_ps(wt363, wt364);
__m512 tmp5983 = _mm512_unpacklo_ps(wt365, wt366);
__m512 tmp5984 = _mm512_unpackhi_ps(wt365, wt366);
__m512 tmp5985 = _mm512_unpacklo_ps(wt367, wt368);
__m512 tmp5986 = _mm512_unpackhi_ps(wt367, wt368);
__m512 tmp5987 = _mm512_unpacklo_ps(wt369, wt370);
__m512 tmp5988 = _mm512_unpackhi_ps(wt369, wt370);
__m512 tmp5989 = _mm512_shuffle_ps(tmp5973, tmp5975, 68);
__m512 tmp5990 = _mm512_shuffle_ps(tmp5973, tmp5975, 238);
__m512 tmp5991 = _mm512_shuffle_ps(tmp5974, tmp5976, 68);
__m512 tmp5992 = _mm512_shuffle_ps(tmp5974, tmp5976, 238);
__m512 tmp5993 = _mm512_shuffle_ps(tmp5977, tmp5979, 68);
__m512 tmp5994 = _mm512_shuffle_ps(tmp5977, tmp5979, 238);
__m512 tmp5995 = _mm512_shuffle_ps(tmp5978, tmp5980, 68);
__m512 tmp5996 = _mm512_shuffle_ps(tmp5978, tmp5980, 238);
__m512 tmp5997 = _mm512_shuffle_ps(tmp5981, tmp5983, 68);
__m512 tmp5998 = _mm512_shuffle_ps(tmp5981, tmp5983, 238);
__m512 tmp5999 = _mm512_shuffle_ps(tmp5982, tmp5984, 68);
__m512 tmp6000 = _mm512_shuffle_ps(tmp5982, tmp5984, 238);
__m512 tmp6001 = _mm512_shuffle_ps(tmp5985, tmp5987, 68);
__m512 tmp6002 = _mm512_shuffle_ps(tmp5985, tmp5987, 238);
__m512 tmp6003 = _mm512_shuffle_ps(tmp5986, tmp5988, 68);
__m512 tmp6004 = _mm512_shuffle_ps(tmp5986, tmp5988, 238);
__m512 tmp6005 = _mm512_shuffle_f32x4(tmp5989, tmp5993, 136);
__m512 tmp6006 = _mm512_shuffle_f32x4(tmp5989, tmp5993, 221);
__m512 tmp6007 = _mm512_shuffle_f32x4(tmp5990, tmp5994, 136);
__m512 tmp6008 = _mm512_shuffle_f32x4(tmp5990, tmp5994, 221);
__m512 tmp6009 = _mm512_shuffle_f32x4(tmp5991, tmp5995, 136);
__m512 tmp6010 = _mm512_shuffle_f32x4(tmp5991, tmp5995, 221);
__m512 tmp6011 = _mm512_shuffle_f32x4(tmp5992, tmp5996, 136);
__m512 tmp6012 = _mm512_shuffle_f32x4(tmp5992, tmp5996, 221);
__m512 tmp6013 = _mm512_shuffle_f32x4(tmp5997, tmp6001, 136);
__m512 tmp6014 = _mm512_shuffle_f32x4(tmp5997, tmp6001, 221);
__m512 tmp6015 = _mm512_shuffle_f32x4(tmp5998, tmp6002, 136);
__m512 tmp6016 = _mm512_shuffle_f32x4(tmp5998, tmp6002, 221);
__m512 tmp6017 = _mm512_shuffle_f32x4(tmp5999, tmp6003, 136);
__m512 tmp6018 = _mm512_shuffle_f32x4(tmp5999, tmp6003, 221);
__m512 tmp6019 = _mm512_shuffle_f32x4(tmp6000, tmp6004, 136);
__m512 tmp6020 = _mm512_shuffle_f32x4(tmp6000, tmp6004, 221);
wt355 = _mm512_shuffle_f32x4(tmp6005, tmp6013, 136);
wt363 = _mm512_shuffle_f32x4(tmp6005, tmp6013, 221);
wt356 = _mm512_shuffle_f32x4(tmp6007, tmp6015, 136);
wt364 = _mm512_shuffle_f32x4(tmp6007, tmp6015, 221);
wt357 = _mm512_shuffle_f32x4(tmp6009, tmp6017, 136);
wt365 = _mm512_shuffle_f32x4(tmp6009, tmp6017, 221);
wt358 = _mm512_shuffle_f32x4(tmp6011, tmp6019, 136);
wt366 = _mm512_shuffle_f32x4(tmp6011, tmp6019, 221);
wt359 = _mm512_shuffle_f32x4(tmp6006, tmp6014, 136);
wt367 = _mm512_shuffle_f32x4(tmp6006, tmp6014, 221);
wt360 = _mm512_shuffle_f32x4(tmp6008, tmp6016, 136);
wt368 = _mm512_shuffle_f32x4(tmp6008, tmp6016, 221);
wt361 = _mm512_shuffle_f32x4(tmp6010, tmp6018, 136);
wt369 = _mm512_shuffle_f32x4(tmp6010, tmp6018, 221);
wt362 = _mm512_shuffle_f32x4(tmp6012, tmp6020, 136);
wt370 = _mm512_shuffle_f32x4(tmp6012, tmp6020, 221);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(1+16*c30)+(ptrdiff_t)0, 63>>cut14, wt355);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(2+16*c30)+(ptrdiff_t)0, 63>>cut14, wt356);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(3+16*c30)+(ptrdiff_t)0, 63>>cut14, wt357);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(4+16*c30)+(ptrdiff_t)0, 63>>cut14, wt358);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(5+16*c30)+(ptrdiff_t)0, 63>>cut14, wt359);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(6+16*c30)+(ptrdiff_t)0, 63>>cut14, wt360);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(7+16*c30)+(ptrdiff_t)0, 63>>cut14, wt361);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(8+16*c30)+(ptrdiff_t)0, 63>>cut14, wt362);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(9+16*c30)+(ptrdiff_t)0, 63>>cut14, wt363);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(10+16*c30)+(ptrdiff_t)0, 63>>cut14, wt364);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(11+16*c30)+(ptrdiff_t)0, 63>>cut14, wt365);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(12+16*c30)+(ptrdiff_t)0, 63>>cut14, wt366);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(13+16*c30)+(ptrdiff_t)0, 63>>cut14, wt367);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(14+16*c30)+(ptrdiff_t)0, 63>>cut14, wt368);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(15+16*c30)+(ptrdiff_t)0, 63>>cut14, wt369);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(16+16*c30)+(ptrdiff_t)0, 63>>cut14, wt370);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(1+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt355);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(2+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt356);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(3+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt357);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(4+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt358);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(5+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt359);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(6+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt360);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(7+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt361);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(8+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt362);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(9+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt363);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(10+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt364);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(11+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt365);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(12+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt366);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(13+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt367);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(14+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt368);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(15+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt369);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(16+16*c30)+(ptrdiff_t)6144, 4032>>cut14, wt370);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(1+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt355);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(2+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt356);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(3+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt357);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(4+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt358);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(5+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt359);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(6+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt360);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(7+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt361);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(8+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt362);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(9+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt363);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(10+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt364);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(11+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt365);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(12+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt366);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(13+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt367);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(14+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt368);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(15+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt369);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+24*(16+16*c30)+(ptrdiff_t)12288, 258048>>cut14, wt370);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(1+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt355);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(2+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt356);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(3+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt357);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(4+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt358);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(5+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt359);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(6+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt360);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(7+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt361);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(8+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt362);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(9+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt363);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(10+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt364);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(11+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt365);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(12+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt366);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(13+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt367);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(14+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt368);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(15+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt369);
_mm512_mask_storeu_ps(arranged13+131584*i39+6168*l32+4*cut14+8*(16+16*c30)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt370);
}
}
}
}
}

/*
 * Hands DenseNet121OneArrangeWts7Callee1 to the threader.
 *
 * team41:    thread team passed through to DenseNet121ThreaderDo1.
 * tensors55: tensor pointer table; stored in the task's `any1` slot, from
 *            which the callee retrieves it.
 *
 * The task is described as 3-dimensional with hull sizes {4, 1, 1}
 * (presumably the extents of the index space the threader iterates over
 * when invoking the callee -- confirm against DenseNet121ThreaderDo1).
 */
static void DenseNet121OneArrangeWts7(DenseNet121ThreaderTeam1* team41, char** tensors55) {
    static const int64_t extents[] = {4, 1, 1};
    enum { dims = sizeof extents/sizeof extents[0] };
    DenseNet121ThreaderTask1 job;
    job.callee1 = DenseNet121OneArrangeWts7Callee1;
    job.any1 = tensors55;
    job.nd1 = dims;
    for (int d = 0; d < dims; ++d) {
        job.hull1[d] = extents[d];
    }
    DenseNet121ThreaderDo1(team41, &job);
}

/*
 * Threader callee: rescales, clamps and re-lays-out one 64-float slice of
 * 128 consecutive channels.
 *
 * task60: task whose `any1` slot holds a table of three buffers:
 *           [0] source data, [1] per-channel (mul, add) float pairs,
 *           [2] destination ("arranged") buffer.
 * pt35:   index tuple; pt35[0] selects the 128-channel block
 *         (channels 128*pt35[0] .. 128*pt35[0]+127) and pt35[1] selects
 *         the 256-byte slice inside every channel.
 *
 * For each channel the slice is loaded as four 16-float vectors, transformed
 * as max(0, x*mul + add) (presumably the folded batch-norm scale/shift
 * followed by ReLU), and stored.
 *
 * Byte layouts, as used by the address arithmetic below:
 *   source:      12608 per channel, 256 per slice
 *   destination: 65536 per slice,   256 per channel
 *
 * NOTE(review): slices 0..48 cover only 49*256 = 12544 of the 12608 bytes in
 * each source channel; this function never reads the trailing 64 bytes.
 * Presumably that is intentional -- confirm against the producer of the
 * source tensor.
 */
static void DenseNet121OneArrangeDats7Callee1(DenseNet121ThreaderTask1* task60, int64_t* pt35) {
    char** bufs = task60->any1;
    const ptrdiff_t block = pt35[0];
    const ptrdiff_t slice = pt35[1];
    /* The generated base offsets are all multiplied by zero, so the bases */
    /* are simply the table entries themselves. */
    char*restrict src = bufs[0];
    char*restrict scaleShift = bufs[1];
    char*restrict dst = bufs[2];

    /* Exactly one slice is handled per call; index 49 is the end sentinel */
    /* and results in no work at all. */
    if (slice == 49) {
        return;
    }

    const __mmask16 allLanes = 65535;
    const __m512 zero = _mm512_setzero_ps();
    const ptrdiff_t chanBegin = 128*block;
    const ptrdiff_t chanEnd = chanBegin+128;

    for (ptrdiff_t chan = chanBegin; chan < chanEnd; ++chan) {
        const char* in = src+256*slice+12608*chan;
        char* out = dst+65536*slice+256*chan;
        __m512 lanes[4];

        /* Load the whole slice before anything is written back. */
        for (int v = 0; v < 4; ++v) {
            lanes[v] = _mm512_maskz_loadu_ps(allLanes, in+64*v);
        }

        /* Two floats per channel: multiplier first, then addend. */
        const float* pair = (const float*)scaleShift+(ptrdiff_t)2*chan;
        const __m512 mul = _mm512_set1_ps(pair[0]);
        const __m512 add = _mm512_set1_ps(pair[1]);

        /* Zero stays the FIRST operand of max so that NaN and signed-zero */
        /* results are exactly those of the original instruction. */
        for (int v = 0; v < 4; ++v) {
            lanes[v] = _mm512_max_ps(zero, _mm512_fmadd_ps(lanes[v], mul, add));
        }

        for (int v = 0; v < 4; ++v) {
            _mm512_mask_storeu_ps(out+64*v, allLanes, lanes[v]);
        }
    }
}

/*
 * Hands DenseNet121OneArrangeDats7Callee1 to the threader.
 *
 * team42:    thread team passed through to DenseNet121ThreaderDo1.
 * tensors57: tensor pointer table; stored in the task's `any1` slot, from
 *            which the callee retrieves it.
 *
 * The task is described as 4-dimensional with hull sizes {2, 49, 1, 1}.
 * The first two line up with what the callee does with its index tuple:
 * index 0 picks one of two 128-channel blocks and index 1 picks a slice
 * that the callee bounds at 49. (Presumably the threader calls the callee
 * once per point of this index space -- confirm against
 * DenseNet121ThreaderDo1.)
 */
static void DenseNet121OneArrangeDats7(DenseNet121ThreaderTeam1* team42, char** tensors57) {
    static const int64_t extents[] = {2, 49, 1, 1};
    enum { dims = sizeof extents/sizeof extents[0] };
    DenseNet121ThreaderTask1 job;
    job.callee1 = DenseNet121OneArrangeDats7Callee1;
    job.any1 = tensors57;
    job.nd1 = dims;
    for (int d = 0; d < dims; ++d) {
        job.hull1[d] = extents[d];
    }
    DenseNet121ThreaderDo1(team42, &job);
}

// Thread-task body for the OneApply7 step: multiplies arranged weights by
// arranged input data and writes the accumulated sums to the output buffer.
//
// task62->any1 is an array whose first slot is the tensor table:
//   table[0] = arranged weights, table[1] = arranged data, table[2] = output.
// pt36[0] selects the starting weight group (2*pt36[0]); at most two
// consecutive groups are processed per call. pt36[1] selects one 64-float
// column tile (tile index 49 means "nothing to do").
//
// Weight groups 0..20 each yield six output rows; group 21 yields only two.
// Every group begins with one float per row used as the accumulator's
// starting value, followed by 256 records holding one float per row.
// Output rows are 12608 bytes apart; a tile spans 4 vectors of 16 floats.
static void DenseNet121OneApply7Callee1(DenseNet121ThreaderTask1* task62, int64_t* pt36) {
	enum { COLS = 4, FULL_ROWS = 6, TAIL_ROWS = 2, DEPTH = 256, TAIL_GROUP = 21, TILE_END = 49 };
	const __mmask16 allLanes = 65535;
	void** ctx = task62->any1;
	char** table = ctx[0];
	char*restrict wtsBase = table[0];
	char*restrict srcBase = table[1];
	char*restrict dstBase = table[2];
	const ptrdiff_t tile = pt36[1];
	const ptrdiff_t pairIdx = pt36[0];
	const ptrdiff_t firstGroup = 2*pairIdx;
	const ptrdiff_t lastGroup = firstGroup+1;
	ptrdiff_t group;
	if (tile == TILE_END) return;
	const char* src = srcBase+65536*tile;
	char* dst = dstBase+256*tile;
	// Full groups: six output rows by four vectors each.
	for (group = firstGroup; group != TAIL_GROUP; ++group) {
		const char* wt = wtsBase+6168*group;
		char* out = dst+75648*group;
		__m512 acc[FULL_ROWS][COLS];
		for (int r = 0; r < FULL_ROWS; ++r) {
			// Every vector of a row starts from the same broadcast value.
			const __m512 start = _mm512_set1_ps(*(const float*)(wt+4*r));
			for (int c = 0; c < COLS; ++c) {
				acc[r][c] = start;
			}
		}
		for (ptrdiff_t s = 0; s < DEPTH; ++s) {
			const char* in = src+256*s;
			// Record s sits after the leading start-value record.
			const char* rec = wt+4*FULL_ROWS*(s+1);
			__m512 x[COLS];
			for (int c = 0; c < COLS; ++c) {
				x[c] = _mm512_loadu_ps(in+64*c);
			}
			for (int r = 0; r < FULL_ROWS; ++r) {
				const __m512 w = _mm512_set1_ps(*(const float*)(rec+4*r));
				for (int c = 0; c < COLS; ++c) {
					acc[r][c] = _mm512_fmadd_ps(w, x[c], acc[r][c]);
				}
			}
		}
		for (int r = 0; r < FULL_ROWS; ++r) {
			for (int c = 0; c < COLS; ++c) {
				_mm512_mask_storeu_ps(out+12608*r+64*c, allLanes, acc[r][c]);
			}
		}
		// Stop once the second group of this task's pair has been written.
		if (group >= lastGroup) return;
	}
	// Only reached when the loop ran into the tail group (group == 21),
	// which carries two output rows instead of six.
	{
		const char* wt = wtsBase+6168*group;
		char* out = dst+75648*group;
		__m512 acc[TAIL_ROWS][COLS];
		for (int r = 0; r < TAIL_ROWS; ++r) {
			const __m512 start = _mm512_set1_ps(*(const float*)(wt+4*r));
			for (int c = 0; c < COLS; ++c) {
				acc[r][c] = start;
			}
		}
		for (ptrdiff_t s = 0; s < DEPTH; ++s) {
			const char* in = src+256*s;
			const char* rec = wt+4*TAIL_ROWS*(s+1);
			__m512 x[COLS];
			for (int c = 0; c < COLS; ++c) {
				x[c] = _mm512_loadu_ps(in+64*c);
			}
			for (int r = 0; r < TAIL_ROWS; ++r) {
				const __m512 w = _mm512_set1_ps(*(const float*)(rec+4*r));
				for (int c = 0; c < COLS; ++c) {
					acc[r][c] = _mm512_fmadd_ps(w, x[c], acc[r][c]);
				}
			}
		}
		for (int r = 0; r < TAIL_ROWS; ++r) {
			for (int c = 0; c < COLS; ++c) {
				_mm512_mask_storeu_ps(out+12608*r+64*c, allLanes, acc[r][c]);
			}
		}
	}
}

// Dispatches DenseNet121OneApply7Callee1 over a three-dimensional
// 11 x 49 x 1 index hull on the given thread team.
//
// team43:    thread team handed to DenseNet121ThreaderDo1.
// tensors59: tensor pointer table; it is wrapped in a two-slot array (second
//            slot null) because the callee reads the table from slot 0 of
//            the task's any1 field.
static void DenseNet121OneApply7(DenseNet121ThreaderTeam1* team43, char** tensors59) {
	// Extent of each hull dimension, in the order the callee indexes them.
	static const int64_t extents[] = {11, 49, 1};
	const int dims = (int)(sizeof extents/sizeof extents[0]);
	void* ctx[2];
	DenseNet121ThreaderTask1 job;
	int d;
	ctx[0] = tensors59;
	ctx[1] = 0;
	for (d = 0; d < dims; ++d) {
		job.hull1[d] = extents[d];
	}
	job.nd1 = dims;
	job.any1 = ctx;
	job.callee1 = DenseNet121OneApply7Callee1;
	DenseNet121ThreaderDo1(team43, &job);
}

static void DenseNet121OneArrangeWts8Callee1(DenseNet121ThreaderTask1* task66, int64_t* pt38) {
char** tensors64 = task66->any1;
ptrdiff_t b53 = pt38[0];
char*restrict wtPtr10 = tensors64[0]+(ptrdiff_t)3340*0+(ptrdiff_t)65536*0;
char*restrict biasPtr10 = tensors64[1]+(ptrdiff_t)512*0;
char*restrict bnPtr15 = tensors64[2]+(ptrdiff_t)8*128*0;
char*restrict arranged15 = tensors64[3]+(ptrdiff_t)428032*0+(ptrdiff_t)66048*0;
ptrdiff_t ii22 = 1;
for (ptrdiff_t i44 = 0; i44 < ii22; ++i44) {
ptrdiff_t j34 = 4*b53;
ptrdiff_t jj41 = j34+4;
for (; j34 < jj41; ++j34) {
if (j34 < 7) {
ptrdiff_t k97 = 0+16*(j34-0);
ptrdiff_t l35 = (size_t)(0+k97)/6;
ptrdiff_t cut17 = (size_t)(0+k97)%6;
switch (cut17) {
case 0:;
case 2: {
__m512 sum288 = _mm512_maskz_loadu_ps(65535, biasPtr10+512*i44+4*k97);
__m512i pmMul20 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd20 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo19 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k97+128*i44));
__m512 masHi19 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k97+128*i44)+(ptrdiff_t)64);
__m512 postMul23 = _mm512_permutex2var_ps(masLo19, pmMul20, masHi19);
__m512 postAdd21 = _mm512_permutex2var_ps(masLo19, pmAdd20, masHi19);
sum288 = _mm512_fmadd_ps(sum288, postMul23, postAdd21);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)0, 63>>cut17, sum288);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)3072, 4032>>cut17, sum288);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)6144, 65535-(4095>>cut17), sum288);
ptrdiff_t c36 = 0;
for (; c36 != 8; ++c36) {
__m512 wt427 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)0);
__m512 wt428 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)512);
__m512 wt429 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)1024);
__m512 wt430 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)1536);
__m512 wt431 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)2048);
__m512 wt432 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)2560);
__m512 wt433 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)3072);
__m512 wt434 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)3584);
__m512 wt435 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)4096);
__m512 wt436 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)4608);
__m512 wt437 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)5120);
__m512 wt438 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)5632);
__m512 wt439 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)6144);
__m512 wt440 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)6656);
__m512 wt441 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)7168);
__m512 wt442 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c36+(ptrdiff_t)7680);
__m512 tmp6021 = _mm512_unpacklo_ps(wt427, wt428);
__m512 tmp6022 = _mm512_unpackhi_ps(wt427, wt428);
__m512 tmp6023 = _mm512_unpacklo_ps(wt429, wt430);
__m512 tmp6024 = _mm512_unpackhi_ps(wt429, wt430);
__m512 tmp6025 = _mm512_unpacklo_ps(wt431, wt432);
__m512 tmp6026 = _mm512_unpackhi_ps(wt431, wt432);
__m512 tmp6027 = _mm512_unpacklo_ps(wt433, wt434);
__m512 tmp6028 = _mm512_unpackhi_ps(wt433, wt434);
__m512 tmp6029 = _mm512_unpacklo_ps(wt435, wt436);
__m512 tmp6030 = _mm512_unpackhi_ps(wt435, wt436);
__m512 tmp6031 = _mm512_unpacklo_ps(wt437, wt438);
__m512 tmp6032 = _mm512_unpackhi_ps(wt437, wt438);
__m512 tmp6033 = _mm512_unpacklo_ps(wt439, wt440);
__m512 tmp6034 = _mm512_unpackhi_ps(wt439, wt440);
__m512 tmp6035 = _mm512_unpacklo_ps(wt441, wt442);
__m512 tmp6036 = _mm512_unpackhi_ps(wt441, wt442);
__m512 tmp6037 = _mm512_shuffle_ps(tmp6021, tmp6023, 68);
__m512 tmp6038 = _mm512_shuffle_ps(tmp6021, tmp6023, 238);
__m512 tmp6039 = _mm512_shuffle_ps(tmp6022, tmp6024, 68);
__m512 tmp6040 = _mm512_shuffle_ps(tmp6022, tmp6024, 238);
__m512 tmp6041 = _mm512_shuffle_ps(tmp6025, tmp6027, 68);
__m512 tmp6042 = _mm512_shuffle_ps(tmp6025, tmp6027, 238);
__m512 tmp6043 = _mm512_shuffle_ps(tmp6026, tmp6028, 68);
__m512 tmp6044 = _mm512_shuffle_ps(tmp6026, tmp6028, 238);
__m512 tmp6045 = _mm512_shuffle_ps(tmp6029, tmp6031, 68);
__m512 tmp6046 = _mm512_shuffle_ps(tmp6029, tmp6031, 238);
__m512 tmp6047 = _mm512_shuffle_ps(tmp6030, tmp6032, 68);
__m512 tmp6048 = _mm512_shuffle_ps(tmp6030, tmp6032, 238);
__m512 tmp6049 = _mm512_shuffle_ps(tmp6033, tmp6035, 68);
__m512 tmp6050 = _mm512_shuffle_ps(tmp6033, tmp6035, 238);
__m512 tmp6051 = _mm512_shuffle_ps(tmp6034, tmp6036, 68);
__m512 tmp6052 = _mm512_shuffle_ps(tmp6034, tmp6036, 238);
__m512 tmp6053 = _mm512_shuffle_f32x4(tmp6037, tmp6041, 136);
__m512 tmp6054 = _mm512_shuffle_f32x4(tmp6037, tmp6041, 221);
__m512 tmp6055 = _mm512_shuffle_f32x4(tmp6038, tmp6042, 136);
__m512 tmp6056 = _mm512_shuffle_f32x4(tmp6038, tmp6042, 221);
__m512 tmp6057 = _mm512_shuffle_f32x4(tmp6039, tmp6043, 136);
__m512 tmp6058 = _mm512_shuffle_f32x4(tmp6039, tmp6043, 221);
__m512 tmp6059 = _mm512_shuffle_f32x4(tmp6040, tmp6044, 136);
__m512 tmp6060 = _mm512_shuffle_f32x4(tmp6040, tmp6044, 221);
__m512 tmp6061 = _mm512_shuffle_f32x4(tmp6045, tmp6049, 136);
__m512 tmp6062 = _mm512_shuffle_f32x4(tmp6045, tmp6049, 221);
__m512 tmp6063 = _mm512_shuffle_f32x4(tmp6046, tmp6050, 136);
__m512 tmp6064 = _mm512_shuffle_f32x4(tmp6046, tmp6050, 221);
__m512 tmp6065 = _mm512_shuffle_f32x4(tmp6047, tmp6051, 136);
__m512 tmp6066 = _mm512_shuffle_f32x4(tmp6047, tmp6051, 221);
__m512 tmp6067 = _mm512_shuffle_f32x4(tmp6048, tmp6052, 136);
__m512 tmp6068 = _mm512_shuffle_f32x4(tmp6048, tmp6052, 221);
wt427 = _mm512_shuffle_f32x4(tmp6053, tmp6061, 136);
wt435 = _mm512_shuffle_f32x4(tmp6053, tmp6061, 221);
wt428 = _mm512_shuffle_f32x4(tmp6055, tmp6063, 136);
wt436 = _mm512_shuffle_f32x4(tmp6055, tmp6063, 221);
wt429 = _mm512_shuffle_f32x4(tmp6057, tmp6065, 136);
wt437 = _mm512_shuffle_f32x4(tmp6057, tmp6065, 221);
wt430 = _mm512_shuffle_f32x4(tmp6059, tmp6067, 136);
wt438 = _mm512_shuffle_f32x4(tmp6059, tmp6067, 221);
wt431 = _mm512_shuffle_f32x4(tmp6054, tmp6062, 136);
wt439 = _mm512_shuffle_f32x4(tmp6054, tmp6062, 221);
wt432 = _mm512_shuffle_f32x4(tmp6056, tmp6064, 136);
wt440 = _mm512_shuffle_f32x4(tmp6056, tmp6064, 221);
wt433 = _mm512_shuffle_f32x4(tmp6058, tmp6066, 136);
wt441 = _mm512_shuffle_f32x4(tmp6058, tmp6066, 221);
wt434 = _mm512_shuffle_f32x4(tmp6060, tmp6068, 136);
wt442 = _mm512_shuffle_f32x4(tmp6060, tmp6068, 221);
wt427 = _mm512_mul_ps(wt427, postMul23);
wt428 = _mm512_mul_ps(wt428, postMul23);
wt429 = _mm512_mul_ps(wt429, postMul23);
wt430 = _mm512_mul_ps(wt430, postMul23);
wt431 = _mm512_mul_ps(wt431, postMul23);
wt432 = _mm512_mul_ps(wt432, postMul23);
wt433 = _mm512_mul_ps(wt433, postMul23);
wt434 = _mm512_mul_ps(wt434, postMul23);
wt435 = _mm512_mul_ps(wt435, postMul23);
wt436 = _mm512_mul_ps(wt436, postMul23);
wt437 = _mm512_mul_ps(wt437, postMul23);
wt438 = _mm512_mul_ps(wt438, postMul23);
wt439 = _mm512_mul_ps(wt439, postMul23);
wt440 = _mm512_mul_ps(wt440, postMul23);
wt441 = _mm512_mul_ps(wt441, postMul23);
wt442 = _mm512_mul_ps(wt442, postMul23);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c36)+(ptrdiff_t)0, 63>>cut17, wt427);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c36)+(ptrdiff_t)0, 63>>cut17, wt428);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c36)+(ptrdiff_t)0, 63>>cut17, wt429);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c36)+(ptrdiff_t)0, 63>>cut17, wt430);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c36)+(ptrdiff_t)0, 63>>cut17, wt431);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c36)+(ptrdiff_t)0, 63>>cut17, wt432);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c36)+(ptrdiff_t)0, 63>>cut17, wt433);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c36)+(ptrdiff_t)0, 63>>cut17, wt434);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c36)+(ptrdiff_t)0, 63>>cut17, wt435);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c36)+(ptrdiff_t)0, 63>>cut17, wt436);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c36)+(ptrdiff_t)0, 63>>cut17, wt437);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c36)+(ptrdiff_t)0, 63>>cut17, wt438);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c36)+(ptrdiff_t)0, 63>>cut17, wt439);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c36)+(ptrdiff_t)0, 63>>cut17, wt440);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c36)+(ptrdiff_t)0, 63>>cut17, wt441);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c36)+(ptrdiff_t)0, 63>>cut17, wt442);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt427);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt428);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt429);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt430);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt431);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt432);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt433);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt434);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt435);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt436);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt437);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt438);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt439);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt440);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt441);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c36)+(ptrdiff_t)3072, 4032>>cut17, wt442);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt427);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt428);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt429);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt430);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt431);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt432);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt433);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt434);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt435);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt436);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt437);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt438);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt439);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt440);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt441);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c36)+(ptrdiff_t)6144, 65535-(4095>>cut17), wt442);
}
break;
}
default: {
cut17 = 4;
__m512 sum289 = _mm512_maskz_loadu_ps(65535, biasPtr10+512*i44+4*k97);
__m512i pmMul21 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd21 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo20 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k97+128*i44));
__m512 masHi20 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k97+128*i44)+(ptrdiff_t)64);
__m512 postMul24 = _mm512_permutex2var_ps(masLo20, pmMul21, masHi20);
__m512 postAdd22 = _mm512_permutex2var_ps(masLo20, pmAdd21, masHi20);
sum289 = _mm512_fmadd_ps(sum289, postMul24, postAdd22);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)0, 63>>cut17, sum289);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)3072, 4032>>cut17, sum289);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)6144, 258048>>cut17, sum289);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*0+(ptrdiff_t)9216, 65535-(262143>>cut17), sum289);
ptrdiff_t c37 = 0;
for (; c37 != 8; ++c37) {
__m512 wt443 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)0);
__m512 wt444 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)512);
__m512 wt445 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)1024);
__m512 wt446 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)1536);
__m512 wt447 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)2048);
__m512 wt448 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)2560);
__m512 wt449 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)3072);
__m512 wt450 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)3584);
__m512 wt451 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)4096);
__m512 wt452 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)4608);
__m512 wt453 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)5120);
__m512 wt454 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)5632);
__m512 wt455 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)6144);
__m512 wt456 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)6656);
__m512 wt457 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)7168);
__m512 wt458 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k97+64*c37+(ptrdiff_t)7680);
__m512 tmp6069 = _mm512_unpacklo_ps(wt443, wt444);
__m512 tmp6070 = _mm512_unpackhi_ps(wt443, wt444);
__m512 tmp6071 = _mm512_unpacklo_ps(wt445, wt446);
__m512 tmp6072 = _mm512_unpackhi_ps(wt445, wt446);
__m512 tmp6073 = _mm512_unpacklo_ps(wt447, wt448);
__m512 tmp6074 = _mm512_unpackhi_ps(wt447, wt448);
__m512 tmp6075 = _mm512_unpacklo_ps(wt449, wt450);
__m512 tmp6076 = _mm512_unpackhi_ps(wt449, wt450);
__m512 tmp6077 = _mm512_unpacklo_ps(wt451, wt452);
__m512 tmp6078 = _mm512_unpackhi_ps(wt451, wt452);
__m512 tmp6079 = _mm512_unpacklo_ps(wt453, wt454);
__m512 tmp6080 = _mm512_unpackhi_ps(wt453, wt454);
__m512 tmp6081 = _mm512_unpacklo_ps(wt455, wt456);
__m512 tmp6082 = _mm512_unpackhi_ps(wt455, wt456);
__m512 tmp6083 = _mm512_unpacklo_ps(wt457, wt458);
__m512 tmp6084 = _mm512_unpackhi_ps(wt457, wt458);
__m512 tmp6085 = _mm512_shuffle_ps(tmp6069, tmp6071, 68);
__m512 tmp6086 = _mm512_shuffle_ps(tmp6069, tmp6071, 238);
__m512 tmp6087 = _mm512_shuffle_ps(tmp6070, tmp6072, 68);
__m512 tmp6088 = _mm512_shuffle_ps(tmp6070, tmp6072, 238);
__m512 tmp6089 = _mm512_shuffle_ps(tmp6073, tmp6075, 68);
__m512 tmp6090 = _mm512_shuffle_ps(tmp6073, tmp6075, 238);
__m512 tmp6091 = _mm512_shuffle_ps(tmp6074, tmp6076, 68);
__m512 tmp6092 = _mm512_shuffle_ps(tmp6074, tmp6076, 238);
__m512 tmp6093 = _mm512_shuffle_ps(tmp6077, tmp6079, 68);
__m512 tmp6094 = _mm512_shuffle_ps(tmp6077, tmp6079, 238);
__m512 tmp6095 = _mm512_shuffle_ps(tmp6078, tmp6080, 68);
__m512 tmp6096 = _mm512_shuffle_ps(tmp6078, tmp6080, 238);
__m512 tmp6097 = _mm512_shuffle_ps(tmp6081, tmp6083, 68);
__m512 tmp6098 = _mm512_shuffle_ps(tmp6081, tmp6083, 238);
__m512 tmp6099 = _mm512_shuffle_ps(tmp6082, tmp6084, 68);
__m512 tmp6100 = _mm512_shuffle_ps(tmp6082, tmp6084, 238);
__m512 tmp6101 = _mm512_shuffle_f32x4(tmp6085, tmp6089, 136);
__m512 tmp6102 = _mm512_shuffle_f32x4(tmp6085, tmp6089, 221);
__m512 tmp6103 = _mm512_shuffle_f32x4(tmp6086, tmp6090, 136);
__m512 tmp6104 = _mm512_shuffle_f32x4(tmp6086, tmp6090, 221);
__m512 tmp6105 = _mm512_shuffle_f32x4(tmp6087, tmp6091, 136);
__m512 tmp6106 = _mm512_shuffle_f32x4(tmp6087, tmp6091, 221);
__m512 tmp6107 = _mm512_shuffle_f32x4(tmp6088, tmp6092, 136);
__m512 tmp6108 = _mm512_shuffle_f32x4(tmp6088, tmp6092, 221);
__m512 tmp6109 = _mm512_shuffle_f32x4(tmp6093, tmp6097, 136);
__m512 tmp6110 = _mm512_shuffle_f32x4(tmp6093, tmp6097, 221);
__m512 tmp6111 = _mm512_shuffle_f32x4(tmp6094, tmp6098, 136);
__m512 tmp6112 = _mm512_shuffle_f32x4(tmp6094, tmp6098, 221);
__m512 tmp6113 = _mm512_shuffle_f32x4(tmp6095, tmp6099, 136);
__m512 tmp6114 = _mm512_shuffle_f32x4(tmp6095, tmp6099, 221);
__m512 tmp6115 = _mm512_shuffle_f32x4(tmp6096, tmp6100, 136);
__m512 tmp6116 = _mm512_shuffle_f32x4(tmp6096, tmp6100, 221);
wt443 = _mm512_shuffle_f32x4(tmp6101, tmp6109, 136);
wt451 = _mm512_shuffle_f32x4(tmp6101, tmp6109, 221);
wt444 = _mm512_shuffle_f32x4(tmp6103, tmp6111, 136);
wt452 = _mm512_shuffle_f32x4(tmp6103, tmp6111, 221);
wt445 = _mm512_shuffle_f32x4(tmp6105, tmp6113, 136);
wt453 = _mm512_shuffle_f32x4(tmp6105, tmp6113, 221);
wt446 = _mm512_shuffle_f32x4(tmp6107, tmp6115, 136);
wt454 = _mm512_shuffle_f32x4(tmp6107, tmp6115, 221);
wt447 = _mm512_shuffle_f32x4(tmp6102, tmp6110, 136);
wt455 = _mm512_shuffle_f32x4(tmp6102, tmp6110, 221);
wt448 = _mm512_shuffle_f32x4(tmp6104, tmp6112, 136);
wt456 = _mm512_shuffle_f32x4(tmp6104, tmp6112, 221);
wt449 = _mm512_shuffle_f32x4(tmp6106, tmp6114, 136);
wt457 = _mm512_shuffle_f32x4(tmp6106, tmp6114, 221);
wt450 = _mm512_shuffle_f32x4(tmp6108, tmp6116, 136);
wt458 = _mm512_shuffle_f32x4(tmp6108, tmp6116, 221);
wt443 = _mm512_mul_ps(wt443, postMul24);
wt444 = _mm512_mul_ps(wt444, postMul24);
wt445 = _mm512_mul_ps(wt445, postMul24);
wt446 = _mm512_mul_ps(wt446, postMul24);
wt447 = _mm512_mul_ps(wt447, postMul24);
wt448 = _mm512_mul_ps(wt448, postMul24);
wt449 = _mm512_mul_ps(wt449, postMul24);
wt450 = _mm512_mul_ps(wt450, postMul24);
wt451 = _mm512_mul_ps(wt451, postMul24);
wt452 = _mm512_mul_ps(wt452, postMul24);
wt453 = _mm512_mul_ps(wt453, postMul24);
wt454 = _mm512_mul_ps(wt454, postMul24);
wt455 = _mm512_mul_ps(wt455, postMul24);
wt456 = _mm512_mul_ps(wt456, postMul24);
wt457 = _mm512_mul_ps(wt457, postMul24);
wt458 = _mm512_mul_ps(wt458, postMul24);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c37)+(ptrdiff_t)0, 63>>cut17, wt443);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c37)+(ptrdiff_t)0, 63>>cut17, wt444);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c37)+(ptrdiff_t)0, 63>>cut17, wt445);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c37)+(ptrdiff_t)0, 63>>cut17, wt446);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c37)+(ptrdiff_t)0, 63>>cut17, wt447);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c37)+(ptrdiff_t)0, 63>>cut17, wt448);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c37)+(ptrdiff_t)0, 63>>cut17, wt449);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c37)+(ptrdiff_t)0, 63>>cut17, wt450);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c37)+(ptrdiff_t)0, 63>>cut17, wt451);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c37)+(ptrdiff_t)0, 63>>cut17, wt452);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c37)+(ptrdiff_t)0, 63>>cut17, wt453);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c37)+(ptrdiff_t)0, 63>>cut17, wt454);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c37)+(ptrdiff_t)0, 63>>cut17, wt455);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c37)+(ptrdiff_t)0, 63>>cut17, wt456);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c37)+(ptrdiff_t)0, 63>>cut17, wt457);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c37)+(ptrdiff_t)0, 63>>cut17, wt458);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt443);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt444);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt445);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt446);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt447);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt448);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt449);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt450);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt451);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt452);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt453);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt454);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt455);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt456);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt457);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c37)+(ptrdiff_t)3072, 4032>>cut17, wt458);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt443);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt444);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt445);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt446);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt447);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt448);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt449);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt450);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt451);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt452);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt453);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt454);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt455);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt456);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt457);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c37)+(ptrdiff_t)6144, 258048>>cut17, wt458);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(1+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt443);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(2+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt444);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(3+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt445);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(4+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt446);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(5+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt447);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(6+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt448);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(7+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt449);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(8+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt450);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(9+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt451);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(10+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt452);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(11+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt453);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(12+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt454);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(13+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt455);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(14+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt456);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(15+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt457);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l35+4*cut17+24*(16+16*c37)+(ptrdiff_t)9216, 65535-(262143>>cut17), wt458);
}
}
}
} else {
ptrdiff_t k96 = 112;
ptrdiff_t l34 = (size_t)(0+k96)/6;
ptrdiff_t cut16 = (size_t)(0+k96)%6;
__m512 sum287 = _mm512_maskz_loadu_ps(65535, biasPtr10+512*i44+4*k96);
__m512i pmMul22 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd22 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo21 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k96+128*i44));
__m512 masHi21 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k96+128*i44)+(ptrdiff_t)64);
__m512 postMul22 = _mm512_permutex2var_ps(masLo21, pmMul22, masHi21);
__m512 postAdd20 = _mm512_permutex2var_ps(masLo21, pmAdd22, masHi21);
sum287 = _mm512_fmadd_ps(sum287, postMul22, postAdd20);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum287);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*0+(ptrdiff_t)3072, 4032>>cut16, sum287);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*0+(ptrdiff_t)6144, 258048>>cut16, sum287);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*0+(ptrdiff_t)9216, 65535-(262143>>cut16), sum287);
ptrdiff_t c35 = 0;
for (; c35 != 8; ++c35) {
__m512 wt411 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)0);
__m512 wt412 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)512);
__m512 wt413 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)1024);
__m512 wt414 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)1536);
__m512 wt415 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)2048);
__m512 wt416 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)2560);
__m512 wt417 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)3072);
__m512 wt418 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)3584);
__m512 wt419 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)4096);
__m512 wt420 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)4608);
__m512 wt421 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)5120);
__m512 wt422 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)5632);
__m512 wt423 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)6144);
__m512 wt424 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)6656);
__m512 wt425 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)7168);
__m512 wt426 = _mm512_maskz_loadu_ps(65535, wtPtr10+65536*i44+512*k96+64*c35+(ptrdiff_t)7680);
__m512 tmp6117 = _mm512_unpacklo_ps(wt411, wt412);
__m512 tmp6118 = _mm512_unpackhi_ps(wt411, wt412);
__m512 tmp6119 = _mm512_unpacklo_ps(wt413, wt414);
__m512 tmp6120 = _mm512_unpackhi_ps(wt413, wt414);
__m512 tmp6121 = _mm512_unpacklo_ps(wt415, wt416);
__m512 tmp6122 = _mm512_unpackhi_ps(wt415, wt416);
__m512 tmp6123 = _mm512_unpacklo_ps(wt417, wt418);
__m512 tmp6124 = _mm512_unpackhi_ps(wt417, wt418);
__m512 tmp6125 = _mm512_unpacklo_ps(wt419, wt420);
__m512 tmp6126 = _mm512_unpackhi_ps(wt419, wt420);
__m512 tmp6127 = _mm512_unpacklo_ps(wt421, wt422);
__m512 tmp6128 = _mm512_unpackhi_ps(wt421, wt422);
__m512 tmp6129 = _mm512_unpacklo_ps(wt423, wt424);
__m512 tmp6130 = _mm512_unpackhi_ps(wt423, wt424);
__m512 tmp6131 = _mm512_unpacklo_ps(wt425, wt426);
__m512 tmp6132 = _mm512_unpackhi_ps(wt425, wt426);
__m512 tmp6133 = _mm512_shuffle_ps(tmp6117, tmp6119, 68);
__m512 tmp6134 = _mm512_shuffle_ps(tmp6117, tmp6119, 238);
__m512 tmp6135 = _mm512_shuffle_ps(tmp6118, tmp6120, 68);
__m512 tmp6136 = _mm512_shuffle_ps(tmp6118, tmp6120, 238);
__m512 tmp6137 = _mm512_shuffle_ps(tmp6121, tmp6123, 68);
__m512 tmp6138 = _mm512_shuffle_ps(tmp6121, tmp6123, 238);
__m512 tmp6139 = _mm512_shuffle_ps(tmp6122, tmp6124, 68);
__m512 tmp6140 = _mm512_shuffle_ps(tmp6122, tmp6124, 238);
__m512 tmp6141 = _mm512_shuffle_ps(tmp6125, tmp6127, 68);
__m512 tmp6142 = _mm512_shuffle_ps(tmp6125, tmp6127, 238);
__m512 tmp6143 = _mm512_shuffle_ps(tmp6126, tmp6128, 68);
__m512 tmp6144 = _mm512_shuffle_ps(tmp6126, tmp6128, 238);
__m512 tmp6145 = _mm512_shuffle_ps(tmp6129, tmp6131, 68);
__m512 tmp6146 = _mm512_shuffle_ps(tmp6129, tmp6131, 238);
__m512 tmp6147 = _mm512_shuffle_ps(tmp6130, tmp6132, 68);
__m512 tmp6148 = _mm512_shuffle_ps(tmp6130, tmp6132, 238);
__m512 tmp6149 = _mm512_shuffle_f32x4(tmp6133, tmp6137, 136);
__m512 tmp6150 = _mm512_shuffle_f32x4(tmp6133, tmp6137, 221);
__m512 tmp6151 = _mm512_shuffle_f32x4(tmp6134, tmp6138, 136);
__m512 tmp6152 = _mm512_shuffle_f32x4(tmp6134, tmp6138, 221);
__m512 tmp6153 = _mm512_shuffle_f32x4(tmp6135, tmp6139, 136);
__m512 tmp6154 = _mm512_shuffle_f32x4(tmp6135, tmp6139, 221);
__m512 tmp6155 = _mm512_shuffle_f32x4(tmp6136, tmp6140, 136);
__m512 tmp6156 = _mm512_shuffle_f32x4(tmp6136, tmp6140, 221);
__m512 tmp6157 = _mm512_shuffle_f32x4(tmp6141, tmp6145, 136);
__m512 tmp6158 = _mm512_shuffle_f32x4(tmp6141, tmp6145, 221);
__m512 tmp6159 = _mm512_shuffle_f32x4(tmp6142, tmp6146, 136);
__m512 tmp6160 = _mm512_shuffle_f32x4(tmp6142, tmp6146, 221);
__m512 tmp6161 = _mm512_shuffle_f32x4(tmp6143, tmp6147, 136);
__m512 tmp6162 = _mm512_shuffle_f32x4(tmp6143, tmp6147, 221);
__m512 tmp6163 = _mm512_shuffle_f32x4(tmp6144, tmp6148, 136);
__m512 tmp6164 = _mm512_shuffle_f32x4(tmp6144, tmp6148, 221);
wt411 = _mm512_shuffle_f32x4(tmp6149, tmp6157, 136);
wt419 = _mm512_shuffle_f32x4(tmp6149, tmp6157, 221);
wt412 = _mm512_shuffle_f32x4(tmp6151, tmp6159, 136);
wt420 = _mm512_shuffle_f32x4(tmp6151, tmp6159, 221);
wt413 = _mm512_shuffle_f32x4(tmp6153, tmp6161, 136);
wt421 = _mm512_shuffle_f32x4(tmp6153, tmp6161, 221);
wt414 = _mm512_shuffle_f32x4(tmp6155, tmp6163, 136);
wt422 = _mm512_shuffle_f32x4(tmp6155, tmp6163, 221);
wt415 = _mm512_shuffle_f32x4(tmp6150, tmp6158, 136);
wt423 = _mm512_shuffle_f32x4(tmp6150, tmp6158, 221);
wt416 = _mm512_shuffle_f32x4(tmp6152, tmp6160, 136);
wt424 = _mm512_shuffle_f32x4(tmp6152, tmp6160, 221);
wt417 = _mm512_shuffle_f32x4(tmp6154, tmp6162, 136);
wt425 = _mm512_shuffle_f32x4(tmp6154, tmp6162, 221);
wt418 = _mm512_shuffle_f32x4(tmp6156, tmp6164, 136);
wt426 = _mm512_shuffle_f32x4(tmp6156, tmp6164, 221);
wt411 = _mm512_mul_ps(wt411, postMul22);
wt412 = _mm512_mul_ps(wt412, postMul22);
wt413 = _mm512_mul_ps(wt413, postMul22);
wt414 = _mm512_mul_ps(wt414, postMul22);
wt415 = _mm512_mul_ps(wt415, postMul22);
wt416 = _mm512_mul_ps(wt416, postMul22);
wt417 = _mm512_mul_ps(wt417, postMul22);
wt418 = _mm512_mul_ps(wt418, postMul22);
wt419 = _mm512_mul_ps(wt419, postMul22);
wt420 = _mm512_mul_ps(wt420, postMul22);
wt421 = _mm512_mul_ps(wt421, postMul22);
wt422 = _mm512_mul_ps(wt422, postMul22);
wt423 = _mm512_mul_ps(wt423, postMul22);
wt424 = _mm512_mul_ps(wt424, postMul22);
wt425 = _mm512_mul_ps(wt425, postMul22);
wt426 = _mm512_mul_ps(wt426, postMul22);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(1+16*c35)+(ptrdiff_t)0, 63>>cut16, wt411);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(2+16*c35)+(ptrdiff_t)0, 63>>cut16, wt412);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(3+16*c35)+(ptrdiff_t)0, 63>>cut16, wt413);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(4+16*c35)+(ptrdiff_t)0, 63>>cut16, wt414);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(5+16*c35)+(ptrdiff_t)0, 63>>cut16, wt415);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(6+16*c35)+(ptrdiff_t)0, 63>>cut16, wt416);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(7+16*c35)+(ptrdiff_t)0, 63>>cut16, wt417);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(8+16*c35)+(ptrdiff_t)0, 63>>cut16, wt418);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(9+16*c35)+(ptrdiff_t)0, 63>>cut16, wt419);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(10+16*c35)+(ptrdiff_t)0, 63>>cut16, wt420);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(11+16*c35)+(ptrdiff_t)0, 63>>cut16, wt421);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(12+16*c35)+(ptrdiff_t)0, 63>>cut16, wt422);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(13+16*c35)+(ptrdiff_t)0, 63>>cut16, wt423);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(14+16*c35)+(ptrdiff_t)0, 63>>cut16, wt424);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(15+16*c35)+(ptrdiff_t)0, 63>>cut16, wt425);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(16+16*c35)+(ptrdiff_t)0, 63>>cut16, wt426);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(1+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt411);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(2+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt412);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(3+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt413);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(4+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt414);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(5+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt415);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(6+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt416);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(7+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt417);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(8+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt418);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(9+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt419);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(10+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt420);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(11+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt421);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(12+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt422);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(13+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt423);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(14+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt424);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(15+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt425);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(16+16*c35)+(ptrdiff_t)3072, 4032>>cut16, wt426);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(1+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt411);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(2+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt412);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(3+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt413);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(4+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt414);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(5+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt415);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(6+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt416);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(7+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt417);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(8+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt418);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(9+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt419);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(10+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt420);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(11+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt421);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(12+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt422);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(13+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt423);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(14+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt424);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(15+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt425);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+24*(16+16*c35)+(ptrdiff_t)6144, 258048>>cut16, wt426);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(1+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt411);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(2+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt412);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(3+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt413);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(4+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt414);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(5+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt415);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(6+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt416);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(7+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt417);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(8+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt418);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(9+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt419);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(10+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt420);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(11+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt421);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(12+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt422);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(13+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt423);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(14+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt424);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(15+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt425);
_mm512_mask_storeu_ps(arranged15+66048*i44+3096*l34+4*cut16+8*(16+16*c35)+(ptrdiff_t)9216, 65535-(262143>>cut16), wt426);
}
}
}
}
}

static void DenseNet121OneArrangeWts8(DenseNet121ThreaderTeam1* team45, char** tensors63) {
	// Launches DenseNet121OneArrangeWts8Callee1 over a 3-dimensional
	// task grid with extents 2 x 1 x 1. The tensor pointer array is
	// forwarded untouched through the task's any1 field.
	// NOTE(review): presumably DenseNet121ThreaderDo1 invokes the callee
	// once per grid point and blocks until all are done -- confirm
	// against the threader implementation.
	static const int64_t extents[3] = {2, 1, 1};
	DenseNet121ThreaderTask1 work;
	work.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		work.hull1[dim] = extents[dim];
	}
	work.any1 = tensors63;
	work.callee1 = DenseNet121OneArrangeWts8Callee1;
	DenseNet121ThreaderDo1(team45, &work);
}

static void DenseNet121OneArrangeDats8Callee1(DenseNet121ThreaderTask1* task68, int64_t* pt39) {
	// Re-packs one column strip of the input for 128 channels, applying a
	// per-channel multiply-add followed by max(0, x) on the way.
	//
	// task68->any1 is the tensor pointer array:
	//   [0] source data, channel planes 3136 bytes apart
	//   [1] per-channel float pairs {multiplier, addend}
	//       (presumably folded batch-norm parameters -- confirm at the caller)
	//   [2] destination, one 32768-byte slab per strip
	// pt39[1] is the strip index. Any value other than 12 selects a
	// 256-byte (4-vector) strip; 12 selects the final 64-byte (1-vector)
	// strip, which is packed with a 64-byte channel stride instead of 256.
	char** tensors66 = task68->any1;
	const ptrdiff_t strip = pt39[1];
	const char*restrict src = tensors66[0]+256*strip;
	const float*restrict scaleShift = (const float*)tensors66[1];
	char*restrict dst = tensors66[2]+32768*strip;
	const __m512 zero = _mm512_setzero_ps();
	ptrdiff_t ch;
	if (strip != 12) {
		// Full strip: four 16-float vectors per channel.
		for (ch = 0; ch < 128; ++ch) {
			const char* in = src+3136*ch;
			char* out = dst+256*ch;
			__m512 v0 = _mm512_loadu_ps(in);
			__m512 v1 = _mm512_loadu_ps(in+64);
			__m512 v2 = _mm512_loadu_ps(in+128);
			__m512 v3 = _mm512_loadu_ps(in+192);
			const __m512 scale = _mm512_set1_ps(scaleShift[2*ch]);
			const __m512 shift = _mm512_set1_ps(scaleShift[2*ch+1]);
			v0 = _mm512_fmadd_ps(v0, scale, shift);
			v1 = _mm512_fmadd_ps(v1, scale, shift);
			v2 = _mm512_fmadd_ps(v2, scale, shift);
			v3 = _mm512_fmadd_ps(v3, scale, shift);
			// Operand order (zero first) is kept as in the generated code.
			v0 = _mm512_max_ps(zero, v0);
			v1 = _mm512_max_ps(zero, v1);
			v2 = _mm512_max_ps(zero, v2);
			v3 = _mm512_max_ps(zero, v3);
			_mm512_storeu_ps(out, v0);
			_mm512_storeu_ps(out+64, v1);
			_mm512_storeu_ps(out+128, v2);
			_mm512_storeu_ps(out+192, v3);
		}
		return;
	}
	// Final strip: a single 16-float vector per channel.
	for (ch = 0; ch < 128; ++ch) {
		__m512 v = _mm512_loadu_ps(src+3136*ch);
		const __m512 scale = _mm512_set1_ps(scaleShift[2*ch]);
		const __m512 shift = _mm512_set1_ps(scaleShift[2*ch+1]);
		v = _mm512_fmadd_ps(v, scale, shift);
		v = _mm512_max_ps(zero, v);
		_mm512_storeu_ps(dst+64*ch, v);
	}
}

static void DenseNet121OneArrangeDats8(DenseNet121ThreaderTeam1* team46, char** tensors65) {
	// Launches DenseNet121OneArrangeDats8Callee1 over a 4-dimensional
	// task grid with extents 1 x 13 x 1 x 1. The callee reads only
	// coordinate [1] (0..12), which it uses as the column-strip index.
	// NOTE(review): presumably DenseNet121ThreaderDo1 invokes the callee
	// once per grid point and blocks until all are done -- confirm
	// against the threader implementation.
	static const int64_t extents[4] = {1, 13, 1, 1};
	DenseNet121ThreaderTask1 work;
	work.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		work.hull1[dim] = extents[dim];
	}
	work.any1 = tensors65;
	work.callee1 = DenseNet121OneArrangeDats8Callee1;
	DenseNet121ThreaderDo1(team46, &work);
}

// Worker for one tile of the "apply" step: multiplies pre-arranged weights by
// pre-arranged data, accumulating over 128 reduction steps, clamps the result
// with max(0, x) and stores it.
//
// task70->any1 points to a pair; element [0] of the pair is the tensor array:
//   tensors68[0]  arranged weights
//   tensors68[1]  arranged data
//   tensors68[2]  output, rows 3136 bytes apart
// pt40[0] (w40) selects which run of output-row groups this task computes.
// pt40[1] (d12) selects the column strip: any value other than 12 takes the
// 64-float (4-vector) path, 12 takes the final 16-float (1-vector) path.
//
// Weight layout as read below: one 3096-byte block per row group (129 records
// of 24 bytes, six floats each). Record 0 seeds the accumulators (presumably
// the bias -- confirm against the arrange step); records 1..128 hold the
// weights for each reduction step. Group index 21 instead uses 8-byte records
// with two floats each.
static void DenseNet121OneApply8Callee1(DenseNet121ThreaderTask1* task70, int64_t* pt40) {
void** pair18 = task70->any1;
char** tensors68 = pair18[0];
// e16 and g17 are fixed at zero here, so the base offsets below vanish.
ptrdiff_t e16 = 0;
ptrdiff_t g17 = 0;
ptrdiff_t d12 = pt40[1];
ptrdiff_t w40 = pt40[0];
char*restrict arrangedWts8 = tensors68[0]+428032*e16+(ptrdiff_t)66048*1*g17;
char*restrict arrangedDats8 = tensors68[1]+2618560*e16+(ptrdiff_t)401408*1*g17;
char*restrict datPtr20 = tensors68[2]+(ptrdiff_t)401408*1*g17;
ptrdiff_t ii24 = 1;
for (ptrdiff_t i46 = 0; i46 < ii24; ++i46) {
ptrdiff_t j36 = 1*d12;
// jj43 == j36, so the check at the bottom of the strip loop returns after
// exactly one strip.
ptrdiff_t jj43 = j36+0;
// Wide-strip path (four vectors per row); skipped entirely when d12 == 12.
for (; j36 != 12; ++j36) {
ptrdiff_t k100 = 4*w40;
// For w40 < 4 the task covers groups 4*w40 .. 4*w40+3 and returns from inside
// the loop. Otherwise the bound is never reached inside the loop (for w40 == 4
// it equals 21), so the loop runs through group 20 and falls through to the
// two-row group 21 below.
ptrdiff_t kk41 = k100+(w40 < 4 ? 3 : 5);
// Six-row groups: 6 rows x 4 vectors = 24 accumulators.
for (; k100 != 21; ++k100) {
// With s27 == -1 the offsets below address record 0 of the group.
ptrdiff_t s27 = -1;
__m512 sum290 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)24));
__m512 sum294 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)28));
__m512 sum298 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)32));
__m512 sum302 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)36));
__m512 sum306 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)40));
__m512 sum310 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)44));
// Each row's seed value is shared by its four column vectors.
__m512 sum291 = sum290;
__m512 sum292 = sum290;
__m512 sum293 = sum290;
__m512 sum295 = sum294;
__m512 sum296 = sum294;
__m512 sum297 = sum294;
__m512 sum299 = sum298;
__m512 sum300 = sum298;
__m512 sum301 = sum298;
__m512 sum303 = sum302;
__m512 sum304 = sum302;
__m512 sum305 = sum302;
__m512 sum307 = sum306;
__m512 sum308 = sum306;
__m512 sum309 = sum306;
__m512 sum311 = sum310;
__m512 sum312 = sum310;
__m512 sum313 = sum310;
// Reduction: per step, load four data vectors once and reuse them for all six
// rows, broadcasting one weight per row.
for (s27 = 0; s27 < 128; ++s27) {
__m512 dat1364 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s27+(ptrdiff_t)0);
__m512 dat1365 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s27+(ptrdiff_t)64);
__m512 dat1366 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s27+(ptrdiff_t)128);
__m512 dat1367 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s27+(ptrdiff_t)192);
__m512 wt459 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)24));
sum290 = _mm512_fmadd_ps(wt459, dat1364, sum290);
sum291 = _mm512_fmadd_ps(wt459, dat1365, sum291);
sum292 = _mm512_fmadd_ps(wt459, dat1366, sum292);
sum293 = _mm512_fmadd_ps(wt459, dat1367, sum293);
__m512 wt460 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)28));
sum294 = _mm512_fmadd_ps(wt460, dat1364, sum294);
sum295 = _mm512_fmadd_ps(wt460, dat1365, sum295);
sum296 = _mm512_fmadd_ps(wt460, dat1366, sum296);
sum297 = _mm512_fmadd_ps(wt460, dat1367, sum297);
__m512 wt461 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)32));
sum298 = _mm512_fmadd_ps(wt461, dat1364, sum298);
sum299 = _mm512_fmadd_ps(wt461, dat1365, sum299);
sum300 = _mm512_fmadd_ps(wt461, dat1366, sum300);
sum301 = _mm512_fmadd_ps(wt461, dat1367, sum301);
__m512 wt462 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)36));
sum302 = _mm512_fmadd_ps(wt462, dat1364, sum302);
sum303 = _mm512_fmadd_ps(wt462, dat1365, sum303);
sum304 = _mm512_fmadd_ps(wt462, dat1366, sum304);
sum305 = _mm512_fmadd_ps(wt462, dat1367, sum305);
__m512 wt463 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)40));
sum306 = _mm512_fmadd_ps(wt463, dat1364, sum306);
sum307 = _mm512_fmadd_ps(wt463, dat1365, sum307);
sum308 = _mm512_fmadd_ps(wt463, dat1366, sum308);
sum309 = _mm512_fmadd_ps(wt463, dat1367, sum309);
__m512 wt464 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+24*s27+(ptrdiff_t)44));
sum310 = _mm512_fmadd_ps(wt464, dat1364, sum310);
sum311 = _mm512_fmadd_ps(wt464, dat1365, sum311);
sum312 = _mm512_fmadd_ps(wt464, dat1366, sum312);
sum313 = _mm512_fmadd_ps(wt464, dat1367, sum313);
}
// Clamp at zero and store. Consecutive output rows are 3136 bytes apart, so a
// six-row group advances the output by 18816 bytes.
sum290 = _mm512_max_ps(_mm512_setzero_ps(), sum290);
sum291 = _mm512_max_ps(_mm512_setzero_ps(), sum291);
sum292 = _mm512_max_ps(_mm512_setzero_ps(), sum292);
sum293 = _mm512_max_ps(_mm512_setzero_ps(), sum293);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)0, 65535, sum290);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)64, 65535, sum291);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)128, 65535, sum292);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)192, 65535, sum293);
sum294 = _mm512_max_ps(_mm512_setzero_ps(), sum294);
sum295 = _mm512_max_ps(_mm512_setzero_ps(), sum295);
sum296 = _mm512_max_ps(_mm512_setzero_ps(), sum296);
sum297 = _mm512_max_ps(_mm512_setzero_ps(), sum297);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3136, 65535, sum294);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3200, 65535, sum295);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3264, 65535, sum296);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3328, 65535, sum297);
sum298 = _mm512_max_ps(_mm512_setzero_ps(), sum298);
sum299 = _mm512_max_ps(_mm512_setzero_ps(), sum299);
sum300 = _mm512_max_ps(_mm512_setzero_ps(), sum300);
sum301 = _mm512_max_ps(_mm512_setzero_ps(), sum301);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)6272, 65535, sum298);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)6336, 65535, sum299);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)6400, 65535, sum300);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)6464, 65535, sum301);
sum302 = _mm512_max_ps(_mm512_setzero_ps(), sum302);
sum303 = _mm512_max_ps(_mm512_setzero_ps(), sum303);
sum304 = _mm512_max_ps(_mm512_setzero_ps(), sum304);
sum305 = _mm512_max_ps(_mm512_setzero_ps(), sum305);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)9408, 65535, sum302);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)9472, 65535, sum303);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)9536, 65535, sum304);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)9600, 65535, sum305);
sum306 = _mm512_max_ps(_mm512_setzero_ps(), sum306);
sum307 = _mm512_max_ps(_mm512_setzero_ps(), sum307);
sum308 = _mm512_max_ps(_mm512_setzero_ps(), sum308);
sum309 = _mm512_max_ps(_mm512_setzero_ps(), sum309);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)12544, 65535, sum306);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)12608, 65535, sum307);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)12672, 65535, sum308);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)12736, 65535, sum309);
sum310 = _mm512_max_ps(_mm512_setzero_ps(), sum310);
sum311 = _mm512_max_ps(_mm512_setzero_ps(), sum311);
sum312 = _mm512_max_ps(_mm512_setzero_ps(), sum312);
sum313 = _mm512_max_ps(_mm512_setzero_ps(), sum313);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)15680, 65535, sum310);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)15744, 65535, sum311);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)15808, 65535, sum312);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)15872, 65535, sum313);
// Last group assigned to this task: nothing more to do.
if (k100 >= kk41) return;
}
// Reached only with k100 == 21: the final group has two rows and uses 8-byte
// weight records (two floats per reduction step).
ptrdiff_t s28 = -1;
__m512 sum314 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+8*s28+(ptrdiff_t)8));
__m512 sum318 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+8*s28+(ptrdiff_t)12));
__m512 sum315 = sum314;
__m512 sum316 = sum314;
__m512 sum317 = sum314;
__m512 sum319 = sum318;
__m512 sum320 = sum318;
__m512 sum321 = sum318;
for (s28 = 0; s28 < 128; ++s28) {
__m512 dat1368 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s28+(ptrdiff_t)0);
__m512 dat1369 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s28+(ptrdiff_t)64);
__m512 dat1370 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s28+(ptrdiff_t)128);
__m512 dat1371 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+256*s28+(ptrdiff_t)192);
__m512 wt465 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+8*s28+(ptrdiff_t)8));
sum314 = _mm512_fmadd_ps(wt465, dat1368, sum314);
sum315 = _mm512_fmadd_ps(wt465, dat1369, sum315);
sum316 = _mm512_fmadd_ps(wt465, dat1370, sum316);
sum317 = _mm512_fmadd_ps(wt465, dat1371, sum317);
__m512 wt466 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k100+8*s28+(ptrdiff_t)12));
sum318 = _mm512_fmadd_ps(wt466, dat1368, sum318);
sum319 = _mm512_fmadd_ps(wt466, dat1369, sum319);
sum320 = _mm512_fmadd_ps(wt466, dat1370, sum320);
sum321 = _mm512_fmadd_ps(wt466, dat1371, sum321);
}
sum314 = _mm512_max_ps(_mm512_setzero_ps(), sum314);
sum315 = _mm512_max_ps(_mm512_setzero_ps(), sum315);
sum316 = _mm512_max_ps(_mm512_setzero_ps(), sum316);
sum317 = _mm512_max_ps(_mm512_setzero_ps(), sum317);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)0, 65535, sum314);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)64, 65535, sum315);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)128, 65535, sum316);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)192, 65535, sum317);
sum318 = _mm512_max_ps(_mm512_setzero_ps(), sum318);
sum319 = _mm512_max_ps(_mm512_setzero_ps(), sum319);
sum320 = _mm512_max_ps(_mm512_setzero_ps(), sum320);
sum321 = _mm512_max_ps(_mm512_setzero_ps(), sum321);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3136, 65535, sum318);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3200, 65535, sum319);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3264, 65535, sum320);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k100+(ptrdiff_t)3328, 65535, sum321);
// Always true on the first pass (jj43 == starting j36): one strip per task.
if (j36 >= jj43) return;
}
// Narrow-strip path, reached only when d12 == 12: same group structure as
// above, but a single data vector per reduction step, read with a 64-byte step
// stride instead of 256.
ptrdiff_t k101 = 4*w40;
ptrdiff_t kk42 = k101+(w40 < 4 ? 3 : 5);
for (; k101 != 21; ++k101) {
ptrdiff_t s29 = -1;
__m512 sum322 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)24));
__m512 sum323 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)28));
__m512 sum324 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)32));
__m512 sum325 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)36));
__m512 sum326 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)40));
__m512 sum327 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)44));
for (s29 = 0; s29 < 128; ++s29) {
__m512 dat1372 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+64*s29+(ptrdiff_t)0);
__m512 wt467 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)24));
sum322 = _mm512_fmadd_ps(wt467, dat1372, sum322);
__m512 wt468 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)28));
sum323 = _mm512_fmadd_ps(wt468, dat1372, sum323);
__m512 wt469 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)32));
sum324 = _mm512_fmadd_ps(wt469, dat1372, sum324);
__m512 wt470 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)36));
sum325 = _mm512_fmadd_ps(wt470, dat1372, sum325);
__m512 wt471 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)40));
sum326 = _mm512_fmadd_ps(wt471, dat1372, sum326);
__m512 wt472 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+24*s29+(ptrdiff_t)44));
sum327 = _mm512_fmadd_ps(wt472, dat1372, sum327);
}
sum322 = _mm512_max_ps(_mm512_setzero_ps(), sum322);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)0, 65535, sum322);
sum323 = _mm512_max_ps(_mm512_setzero_ps(), sum323);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)3136, 65535, sum323);
sum324 = _mm512_max_ps(_mm512_setzero_ps(), sum324);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)6272, 65535, sum324);
sum325 = _mm512_max_ps(_mm512_setzero_ps(), sum325);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)9408, 65535, sum325);
sum326 = _mm512_max_ps(_mm512_setzero_ps(), sum326);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)12544, 65535, sum326);
sum327 = _mm512_max_ps(_mm512_setzero_ps(), sum327);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)15680, 65535, sum327);
if (k101 >= kk42) return;
}
// Narrow strip, final two-row group (k101 == 21).
ptrdiff_t s30 = -1;
__m512 sum328 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+8*s30+(ptrdiff_t)8));
__m512 sum329 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+8*s30+(ptrdiff_t)12));
for (s30 = 0; s30 < 128; ++s30) {
__m512 dat1373 = _mm512_loadu_ps(arrangedDats8+401408*i46+32768*j36+64*s30+(ptrdiff_t)0);
__m512 wt473 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+8*s30+(ptrdiff_t)8));
sum328 = _mm512_fmadd_ps(wt473, dat1373, sum328);
__m512 wt474 = _mm512_set1_ps(*(float*)(arrangedWts8+66048*i46+3096*k101+8*s30+(ptrdiff_t)12));
sum329 = _mm512_fmadd_ps(wt474, dat1373, sum329);
}
sum328 = _mm512_max_ps(_mm512_setzero_ps(), sum328);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)0, 65535, sum328);
sum329 = _mm512_max_ps(_mm512_setzero_ps(), sum329);
_mm512_mask_storeu_ps(datPtr20+401408*i46+256*j36+18816*k101+(ptrdiff_t)3136, 65535, sum329);
}
}

// Dispatches DenseNet121OneApply8Callee1 through the threader.
//
// team47:    threader team handed straight to DenseNet121ThreaderDo1.
// tensors67: tensor pointer table; passed to the callee as element 0 of a
//            two-slot context array whose second slot is null.
//
// The task is declared with 3 dimensions and hull extents {5, 13, 1}.
// NOTE(review): presumably the callee is invoked once per point of that
// 5 x 13 x 1 grid -- confirm against DenseNet121ThreaderDo1.
static void DenseNet121OneApply8(DenseNet121ThreaderTeam1* team47, char** tensors67) {
    DenseNet121ThreaderTask1 job;
    void* context[2];

    // Context handed to the callee via any1; it lives on this stack frame,
    // so it is only valid for the duration of the Do1 call below.
    context[0] = tensors67;
    context[1] = 0;

    // Iteration space of the task.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 13;
    job.hull1[0] = 5;

    // Work function and its opaque argument.
    job.any1 = context;
    job.callee1 = DenseNet121OneApply8Callee1;

    DenseNet121ThreaderDo1(team47, &job);
}

static void DenseNet121OneArrangeWts9Callee1(DenseNet121ThreaderTask1* task80, int64_t* pt45) {
char** tensors78 = task80->any1;
ptrdiff_t b57 = pt45[0];
char*restrict wtPtr12 = tensors78[0]+(ptrdiff_t)3340*0+(ptrdiff_t)81920*0;
char*restrict biasPtr12 = tensors78[1]+(ptrdiff_t)512*0;
char*restrict bnPtr17 = tensors78[2]+(ptrdiff_t)8*128*0;
char*restrict arranged17 = tensors78[3]+(ptrdiff_t)428032*0+(ptrdiff_t)82432*0;
ptrdiff_t ii25 = 1;
for (ptrdiff_t i51 = 0; i51 < ii25; ++i51) {
ptrdiff_t j41 = 4*b57;
ptrdiff_t jj45 = j41+4;
for (; j41 < jj45; ++j41) {
if (j41 < 7) {
ptrdiff_t k115 = 0+16*(j41-0);
ptrdiff_t l44 = (size_t)(0+k115)/6;
ptrdiff_t cut20 = (size_t)(0+k115)%6;
switch (cut20) {
case 0:;
case 2: {
__m512 sum359 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i51+4*k115);
__m512i pmMul23 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd23 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo22 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k115+128*i51));
__m512 masHi22 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k115+128*i51)+(ptrdiff_t)64);
__m512 postMul26 = _mm512_permutex2var_ps(masLo22, pmMul23, masHi22);
__m512 postAdd24 = _mm512_permutex2var_ps(masLo22, pmAdd23, masHi22);
sum359 = _mm512_fmadd_ps(sum359, postMul26, postAdd24);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)0, 63>>cut20, sum359);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)3840, 4032>>cut20, sum359);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)7680, 65535-(4095>>cut20), sum359);
ptrdiff_t c41 = 0;
for (; c41 != 10; ++c41) {
__m512 wt495 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)0);
__m512 wt496 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)640);
__m512 wt497 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)1280);
__m512 wt498 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)1920);
__m512 wt499 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)2560);
__m512 wt500 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)3200);
__m512 wt501 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)3840);
__m512 wt502 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)4480);
__m512 wt503 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)5120);
__m512 wt504 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)5760);
__m512 wt505 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)6400);
__m512 wt506 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)7040);
__m512 wt507 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)7680);
__m512 wt508 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)8320);
__m512 wt509 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)8960);
__m512 wt510 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c41+(ptrdiff_t)9600);
__m512 tmp8783 = _mm512_unpacklo_ps(wt495, wt496);
__m512 tmp8784 = _mm512_unpackhi_ps(wt495, wt496);
__m512 tmp8785 = _mm512_unpacklo_ps(wt497, wt498);
__m512 tmp8786 = _mm512_unpackhi_ps(wt497, wt498);
__m512 tmp8787 = _mm512_unpacklo_ps(wt499, wt500);
__m512 tmp8788 = _mm512_unpackhi_ps(wt499, wt500);
__m512 tmp8789 = _mm512_unpacklo_ps(wt501, wt502);
__m512 tmp8790 = _mm512_unpackhi_ps(wt501, wt502);
__m512 tmp8791 = _mm512_unpacklo_ps(wt503, wt504);
__m512 tmp8792 = _mm512_unpackhi_ps(wt503, wt504);
__m512 tmp8793 = _mm512_unpacklo_ps(wt505, wt506);
__m512 tmp8794 = _mm512_unpackhi_ps(wt505, wt506);
__m512 tmp8795 = _mm512_unpacklo_ps(wt507, wt508);
__m512 tmp8796 = _mm512_unpackhi_ps(wt507, wt508);
__m512 tmp8797 = _mm512_unpacklo_ps(wt509, wt510);
__m512 tmp8798 = _mm512_unpackhi_ps(wt509, wt510);
__m512 tmp8799 = _mm512_shuffle_ps(tmp8783, tmp8785, 68);
__m512 tmp8800 = _mm512_shuffle_ps(tmp8783, tmp8785, 238);
__m512 tmp8801 = _mm512_shuffle_ps(tmp8784, tmp8786, 68);
__m512 tmp8802 = _mm512_shuffle_ps(tmp8784, tmp8786, 238);
__m512 tmp8803 = _mm512_shuffle_ps(tmp8787, tmp8789, 68);
__m512 tmp8804 = _mm512_shuffle_ps(tmp8787, tmp8789, 238);
__m512 tmp8805 = _mm512_shuffle_ps(tmp8788, tmp8790, 68);
__m512 tmp8806 = _mm512_shuffle_ps(tmp8788, tmp8790, 238);
__m512 tmp8807 = _mm512_shuffle_ps(tmp8791, tmp8793, 68);
__m512 tmp8808 = _mm512_shuffle_ps(tmp8791, tmp8793, 238);
__m512 tmp8809 = _mm512_shuffle_ps(tmp8792, tmp8794, 68);
__m512 tmp8810 = _mm512_shuffle_ps(tmp8792, tmp8794, 238);
__m512 tmp8811 = _mm512_shuffle_ps(tmp8795, tmp8797, 68);
__m512 tmp8812 = _mm512_shuffle_ps(tmp8795, tmp8797, 238);
__m512 tmp8813 = _mm512_shuffle_ps(tmp8796, tmp8798, 68);
__m512 tmp8814 = _mm512_shuffle_ps(tmp8796, tmp8798, 238);
__m512 tmp8815 = _mm512_shuffle_f32x4(tmp8799, tmp8803, 136);
__m512 tmp8816 = _mm512_shuffle_f32x4(tmp8799, tmp8803, 221);
__m512 tmp8817 = _mm512_shuffle_f32x4(tmp8800, tmp8804, 136);
__m512 tmp8818 = _mm512_shuffle_f32x4(tmp8800, tmp8804, 221);
__m512 tmp8819 = _mm512_shuffle_f32x4(tmp8801, tmp8805, 136);
__m512 tmp8820 = _mm512_shuffle_f32x4(tmp8801, tmp8805, 221);
__m512 tmp8821 = _mm512_shuffle_f32x4(tmp8802, tmp8806, 136);
__m512 tmp8822 = _mm512_shuffle_f32x4(tmp8802, tmp8806, 221);
__m512 tmp8823 = _mm512_shuffle_f32x4(tmp8807, tmp8811, 136);
__m512 tmp8824 = _mm512_shuffle_f32x4(tmp8807, tmp8811, 221);
__m512 tmp8825 = _mm512_shuffle_f32x4(tmp8808, tmp8812, 136);
__m512 tmp8826 = _mm512_shuffle_f32x4(tmp8808, tmp8812, 221);
__m512 tmp8827 = _mm512_shuffle_f32x4(tmp8809, tmp8813, 136);
__m512 tmp8828 = _mm512_shuffle_f32x4(tmp8809, tmp8813, 221);
__m512 tmp8829 = _mm512_shuffle_f32x4(tmp8810, tmp8814, 136);
__m512 tmp8830 = _mm512_shuffle_f32x4(tmp8810, tmp8814, 221);
wt495 = _mm512_shuffle_f32x4(tmp8815, tmp8823, 136);
wt503 = _mm512_shuffle_f32x4(tmp8815, tmp8823, 221);
wt496 = _mm512_shuffle_f32x4(tmp8817, tmp8825, 136);
wt504 = _mm512_shuffle_f32x4(tmp8817, tmp8825, 221);
wt497 = _mm512_shuffle_f32x4(tmp8819, tmp8827, 136);
wt505 = _mm512_shuffle_f32x4(tmp8819, tmp8827, 221);
wt498 = _mm512_shuffle_f32x4(tmp8821, tmp8829, 136);
wt506 = _mm512_shuffle_f32x4(tmp8821, tmp8829, 221);
wt499 = _mm512_shuffle_f32x4(tmp8816, tmp8824, 136);
wt507 = _mm512_shuffle_f32x4(tmp8816, tmp8824, 221);
wt500 = _mm512_shuffle_f32x4(tmp8818, tmp8826, 136);
wt508 = _mm512_shuffle_f32x4(tmp8818, tmp8826, 221);
wt501 = _mm512_shuffle_f32x4(tmp8820, tmp8828, 136);
wt509 = _mm512_shuffle_f32x4(tmp8820, tmp8828, 221);
wt502 = _mm512_shuffle_f32x4(tmp8822, tmp8830, 136);
wt510 = _mm512_shuffle_f32x4(tmp8822, tmp8830, 221);
wt495 = _mm512_mul_ps(wt495, postMul26);
wt496 = _mm512_mul_ps(wt496, postMul26);
wt497 = _mm512_mul_ps(wt497, postMul26);
wt498 = _mm512_mul_ps(wt498, postMul26);
wt499 = _mm512_mul_ps(wt499, postMul26);
wt500 = _mm512_mul_ps(wt500, postMul26);
wt501 = _mm512_mul_ps(wt501, postMul26);
wt502 = _mm512_mul_ps(wt502, postMul26);
wt503 = _mm512_mul_ps(wt503, postMul26);
wt504 = _mm512_mul_ps(wt504, postMul26);
wt505 = _mm512_mul_ps(wt505, postMul26);
wt506 = _mm512_mul_ps(wt506, postMul26);
wt507 = _mm512_mul_ps(wt507, postMul26);
wt508 = _mm512_mul_ps(wt508, postMul26);
wt509 = _mm512_mul_ps(wt509, postMul26);
wt510 = _mm512_mul_ps(wt510, postMul26);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c41)+(ptrdiff_t)0, 63>>cut20, wt495);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c41)+(ptrdiff_t)0, 63>>cut20, wt496);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c41)+(ptrdiff_t)0, 63>>cut20, wt497);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c41)+(ptrdiff_t)0, 63>>cut20, wt498);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c41)+(ptrdiff_t)0, 63>>cut20, wt499);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c41)+(ptrdiff_t)0, 63>>cut20, wt500);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c41)+(ptrdiff_t)0, 63>>cut20, wt501);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c41)+(ptrdiff_t)0, 63>>cut20, wt502);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c41)+(ptrdiff_t)0, 63>>cut20, wt503);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c41)+(ptrdiff_t)0, 63>>cut20, wt504);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c41)+(ptrdiff_t)0, 63>>cut20, wt505);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c41)+(ptrdiff_t)0, 63>>cut20, wt506);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c41)+(ptrdiff_t)0, 63>>cut20, wt507);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c41)+(ptrdiff_t)0, 63>>cut20, wt508);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c41)+(ptrdiff_t)0, 63>>cut20, wt509);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c41)+(ptrdiff_t)0, 63>>cut20, wt510);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt495);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt496);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt497);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt498);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt499);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt500);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt501);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt502);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt503);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt504);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt505);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt506);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt507);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt508);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt509);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c41)+(ptrdiff_t)3840, 4032>>cut20, wt510);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt495);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt496);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt497);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt498);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt499);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt500);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt501);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt502);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt503);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt504);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt505);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt506);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt507);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt508);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt509);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c41)+(ptrdiff_t)7680, 65535-(4095>>cut20), wt510);
}
break;
}
default: {
cut20 = 4;
__m512 sum360 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i51+4*k115);
__m512i pmMul24 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd24 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo23 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k115+128*i51));
__m512 masHi23 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k115+128*i51)+(ptrdiff_t)64);
__m512 postMul27 = _mm512_permutex2var_ps(masLo23, pmMul24, masHi23);
__m512 postAdd25 = _mm512_permutex2var_ps(masLo23, pmAdd24, masHi23);
sum360 = _mm512_fmadd_ps(sum360, postMul27, postAdd25);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)0, 63>>cut20, sum360);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)3840, 4032>>cut20, sum360);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)7680, 258048>>cut20, sum360);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*0+(ptrdiff_t)11520, 65535-(262143>>cut20), sum360);
ptrdiff_t c42 = 0;
for (; c42 != 10; ++c42) {
__m512 wt511 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)0);
__m512 wt512 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)640);
__m512 wt513 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)1280);
__m512 wt514 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)1920);
__m512 wt515 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)2560);
__m512 wt516 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)3200);
__m512 wt517 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)3840);
__m512 wt518 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)4480);
__m512 wt519 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)5120);
__m512 wt520 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)5760);
__m512 wt521 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)6400);
__m512 wt522 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)7040);
__m512 wt523 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)7680);
__m512 wt524 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)8320);
__m512 wt525 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)8960);
__m512 wt526 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k115+64*c42+(ptrdiff_t)9600);
__m512 tmp8831 = _mm512_unpacklo_ps(wt511, wt512);
__m512 tmp8832 = _mm512_unpackhi_ps(wt511, wt512);
__m512 tmp8833 = _mm512_unpacklo_ps(wt513, wt514);
__m512 tmp8834 = _mm512_unpackhi_ps(wt513, wt514);
__m512 tmp8835 = _mm512_unpacklo_ps(wt515, wt516);
__m512 tmp8836 = _mm512_unpackhi_ps(wt515, wt516);
__m512 tmp8837 = _mm512_unpacklo_ps(wt517, wt518);
__m512 tmp8838 = _mm512_unpackhi_ps(wt517, wt518);
__m512 tmp8839 = _mm512_unpacklo_ps(wt519, wt520);
__m512 tmp8840 = _mm512_unpackhi_ps(wt519, wt520);
__m512 tmp8841 = _mm512_unpacklo_ps(wt521, wt522);
__m512 tmp8842 = _mm512_unpackhi_ps(wt521, wt522);
__m512 tmp8843 = _mm512_unpacklo_ps(wt523, wt524);
__m512 tmp8844 = _mm512_unpackhi_ps(wt523, wt524);
__m512 tmp8845 = _mm512_unpacklo_ps(wt525, wt526);
__m512 tmp8846 = _mm512_unpackhi_ps(wt525, wt526);
__m512 tmp8847 = _mm512_shuffle_ps(tmp8831, tmp8833, 68);
__m512 tmp8848 = _mm512_shuffle_ps(tmp8831, tmp8833, 238);
__m512 tmp8849 = _mm512_shuffle_ps(tmp8832, tmp8834, 68);
__m512 tmp8850 = _mm512_shuffle_ps(tmp8832, tmp8834, 238);
__m512 tmp8851 = _mm512_shuffle_ps(tmp8835, tmp8837, 68);
__m512 tmp8852 = _mm512_shuffle_ps(tmp8835, tmp8837, 238);
__m512 tmp8853 = _mm512_shuffle_ps(tmp8836, tmp8838, 68);
__m512 tmp8854 = _mm512_shuffle_ps(tmp8836, tmp8838, 238);
__m512 tmp8855 = _mm512_shuffle_ps(tmp8839, tmp8841, 68);
__m512 tmp8856 = _mm512_shuffle_ps(tmp8839, tmp8841, 238);
__m512 tmp8857 = _mm512_shuffle_ps(tmp8840, tmp8842, 68);
__m512 tmp8858 = _mm512_shuffle_ps(tmp8840, tmp8842, 238);
__m512 tmp8859 = _mm512_shuffle_ps(tmp8843, tmp8845, 68);
__m512 tmp8860 = _mm512_shuffle_ps(tmp8843, tmp8845, 238);
__m512 tmp8861 = _mm512_shuffle_ps(tmp8844, tmp8846, 68);
__m512 tmp8862 = _mm512_shuffle_ps(tmp8844, tmp8846, 238);
__m512 tmp8863 = _mm512_shuffle_f32x4(tmp8847, tmp8851, 136);
__m512 tmp8864 = _mm512_shuffle_f32x4(tmp8847, tmp8851, 221);
__m512 tmp8865 = _mm512_shuffle_f32x4(tmp8848, tmp8852, 136);
__m512 tmp8866 = _mm512_shuffle_f32x4(tmp8848, tmp8852, 221);
__m512 tmp8867 = _mm512_shuffle_f32x4(tmp8849, tmp8853, 136);
__m512 tmp8868 = _mm512_shuffle_f32x4(tmp8849, tmp8853, 221);
__m512 tmp8869 = _mm512_shuffle_f32x4(tmp8850, tmp8854, 136);
__m512 tmp8870 = _mm512_shuffle_f32x4(tmp8850, tmp8854, 221);
__m512 tmp8871 = _mm512_shuffle_f32x4(tmp8855, tmp8859, 136);
__m512 tmp8872 = _mm512_shuffle_f32x4(tmp8855, tmp8859, 221);
__m512 tmp8873 = _mm512_shuffle_f32x4(tmp8856, tmp8860, 136);
__m512 tmp8874 = _mm512_shuffle_f32x4(tmp8856, tmp8860, 221);
__m512 tmp8875 = _mm512_shuffle_f32x4(tmp8857, tmp8861, 136);
__m512 tmp8876 = _mm512_shuffle_f32x4(tmp8857, tmp8861, 221);
__m512 tmp8877 = _mm512_shuffle_f32x4(tmp8858, tmp8862, 136);
__m512 tmp8878 = _mm512_shuffle_f32x4(tmp8858, tmp8862, 221);
wt511 = _mm512_shuffle_f32x4(tmp8863, tmp8871, 136);
wt519 = _mm512_shuffle_f32x4(tmp8863, tmp8871, 221);
wt512 = _mm512_shuffle_f32x4(tmp8865, tmp8873, 136);
wt520 = _mm512_shuffle_f32x4(tmp8865, tmp8873, 221);
wt513 = _mm512_shuffle_f32x4(tmp8867, tmp8875, 136);
wt521 = _mm512_shuffle_f32x4(tmp8867, tmp8875, 221);
wt514 = _mm512_shuffle_f32x4(tmp8869, tmp8877, 136);
wt522 = _mm512_shuffle_f32x4(tmp8869, tmp8877, 221);
wt515 = _mm512_shuffle_f32x4(tmp8864, tmp8872, 136);
wt523 = _mm512_shuffle_f32x4(tmp8864, tmp8872, 221);
wt516 = _mm512_shuffle_f32x4(tmp8866, tmp8874, 136);
wt524 = _mm512_shuffle_f32x4(tmp8866, tmp8874, 221);
wt517 = _mm512_shuffle_f32x4(tmp8868, tmp8876, 136);
wt525 = _mm512_shuffle_f32x4(tmp8868, tmp8876, 221);
wt518 = _mm512_shuffle_f32x4(tmp8870, tmp8878, 136);
wt526 = _mm512_shuffle_f32x4(tmp8870, tmp8878, 221);
wt511 = _mm512_mul_ps(wt511, postMul27);
wt512 = _mm512_mul_ps(wt512, postMul27);
wt513 = _mm512_mul_ps(wt513, postMul27);
wt514 = _mm512_mul_ps(wt514, postMul27);
wt515 = _mm512_mul_ps(wt515, postMul27);
wt516 = _mm512_mul_ps(wt516, postMul27);
wt517 = _mm512_mul_ps(wt517, postMul27);
wt518 = _mm512_mul_ps(wt518, postMul27);
wt519 = _mm512_mul_ps(wt519, postMul27);
wt520 = _mm512_mul_ps(wt520, postMul27);
wt521 = _mm512_mul_ps(wt521, postMul27);
wt522 = _mm512_mul_ps(wt522, postMul27);
wt523 = _mm512_mul_ps(wt523, postMul27);
wt524 = _mm512_mul_ps(wt524, postMul27);
wt525 = _mm512_mul_ps(wt525, postMul27);
wt526 = _mm512_mul_ps(wt526, postMul27);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c42)+(ptrdiff_t)0, 63>>cut20, wt511);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c42)+(ptrdiff_t)0, 63>>cut20, wt512);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c42)+(ptrdiff_t)0, 63>>cut20, wt513);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c42)+(ptrdiff_t)0, 63>>cut20, wt514);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c42)+(ptrdiff_t)0, 63>>cut20, wt515);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c42)+(ptrdiff_t)0, 63>>cut20, wt516);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c42)+(ptrdiff_t)0, 63>>cut20, wt517);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c42)+(ptrdiff_t)0, 63>>cut20, wt518);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c42)+(ptrdiff_t)0, 63>>cut20, wt519);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c42)+(ptrdiff_t)0, 63>>cut20, wt520);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c42)+(ptrdiff_t)0, 63>>cut20, wt521);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c42)+(ptrdiff_t)0, 63>>cut20, wt522);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c42)+(ptrdiff_t)0, 63>>cut20, wt523);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c42)+(ptrdiff_t)0, 63>>cut20, wt524);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c42)+(ptrdiff_t)0, 63>>cut20, wt525);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c42)+(ptrdiff_t)0, 63>>cut20, wt526);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt511);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt512);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt513);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt514);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt515);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt516);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt517);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt518);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt519);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt520);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt521);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt522);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt523);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt524);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt525);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c42)+(ptrdiff_t)3840, 4032>>cut20, wt526);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt511);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt512);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt513);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt514);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt515);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt516);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt517);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt518);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt519);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt520);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt521);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt522);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt523);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt524);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt525);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c42)+(ptrdiff_t)7680, 258048>>cut20, wt526);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(1+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt511);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(2+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt512);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(3+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt513);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(4+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt514);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(5+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt515);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(6+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt516);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(7+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt517);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(8+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt518);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(9+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt519);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(10+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt520);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(11+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt521);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(12+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt522);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(13+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt523);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(14+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt524);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(15+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt525);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l44+4*cut20+24*(16+16*c42)+(ptrdiff_t)11520, 65535-(262143>>cut20), wt526);
}
}
}
} else {
ptrdiff_t k114 = 112;
ptrdiff_t l43 = (size_t)(0+k114)/6;
ptrdiff_t cut19 = (size_t)(0+k114)%6;
__m512 sum358 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i51+4*k114);
__m512i pmMul25 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd25 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo24 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k114+128*i51));
__m512 masHi24 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k114+128*i51)+(ptrdiff_t)64);
__m512 postMul25 = _mm512_permutex2var_ps(masLo24, pmMul25, masHi24);
__m512 postAdd23 = _mm512_permutex2var_ps(masLo24, pmAdd25, masHi24);
sum358 = _mm512_fmadd_ps(sum358, postMul25, postAdd23);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum358);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*0+(ptrdiff_t)3840, 4032>>cut19, sum358);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*0+(ptrdiff_t)7680, 258048>>cut19, sum358);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*0+(ptrdiff_t)11520, 65535-(262143>>cut19), sum358);
ptrdiff_t c40 = 0;
for (; c40 != 10; ++c40) {
__m512 wt479 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)0);
__m512 wt480 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)640);
__m512 wt481 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)1280);
__m512 wt482 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)1920);
__m512 wt483 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)2560);
__m512 wt484 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)3200);
__m512 wt485 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)3840);
__m512 wt486 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)4480);
__m512 wt487 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)5120);
__m512 wt488 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)5760);
__m512 wt489 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)6400);
__m512 wt490 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)7040);
__m512 wt491 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)7680);
__m512 wt492 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)8320);
__m512 wt493 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)8960);
__m512 wt494 = _mm512_maskz_loadu_ps(65535, wtPtr12+81920*i51+640*k114+64*c40+(ptrdiff_t)9600);
__m512 tmp8879 = _mm512_unpacklo_ps(wt479, wt480);
__m512 tmp8880 = _mm512_unpackhi_ps(wt479, wt480);
__m512 tmp8881 = _mm512_unpacklo_ps(wt481, wt482);
__m512 tmp8882 = _mm512_unpackhi_ps(wt481, wt482);
__m512 tmp8883 = _mm512_unpacklo_ps(wt483, wt484);
__m512 tmp8884 = _mm512_unpackhi_ps(wt483, wt484);
__m512 tmp8885 = _mm512_unpacklo_ps(wt485, wt486);
__m512 tmp8886 = _mm512_unpackhi_ps(wt485, wt486);
__m512 tmp8887 = _mm512_unpacklo_ps(wt487, wt488);
__m512 tmp8888 = _mm512_unpackhi_ps(wt487, wt488);
__m512 tmp8889 = _mm512_unpacklo_ps(wt489, wt490);
__m512 tmp8890 = _mm512_unpackhi_ps(wt489, wt490);
__m512 tmp8891 = _mm512_unpacklo_ps(wt491, wt492);
__m512 tmp8892 = _mm512_unpackhi_ps(wt491, wt492);
__m512 tmp8893 = _mm512_unpacklo_ps(wt493, wt494);
__m512 tmp8894 = _mm512_unpackhi_ps(wt493, wt494);
__m512 tmp8895 = _mm512_shuffle_ps(tmp8879, tmp8881, 68);
__m512 tmp8896 = _mm512_shuffle_ps(tmp8879, tmp8881, 238);
__m512 tmp8897 = _mm512_shuffle_ps(tmp8880, tmp8882, 68);
__m512 tmp8898 = _mm512_shuffle_ps(tmp8880, tmp8882, 238);
__m512 tmp8899 = _mm512_shuffle_ps(tmp8883, tmp8885, 68);
__m512 tmp8900 = _mm512_shuffle_ps(tmp8883, tmp8885, 238);
__m512 tmp8901 = _mm512_shuffle_ps(tmp8884, tmp8886, 68);
__m512 tmp8902 = _mm512_shuffle_ps(tmp8884, tmp8886, 238);
__m512 tmp8903 = _mm512_shuffle_ps(tmp8887, tmp8889, 68);
__m512 tmp8904 = _mm512_shuffle_ps(tmp8887, tmp8889, 238);
__m512 tmp8905 = _mm512_shuffle_ps(tmp8888, tmp8890, 68);
__m512 tmp8906 = _mm512_shuffle_ps(tmp8888, tmp8890, 238);
__m512 tmp8907 = _mm512_shuffle_ps(tmp8891, tmp8893, 68);
__m512 tmp8908 = _mm512_shuffle_ps(tmp8891, tmp8893, 238);
__m512 tmp8909 = _mm512_shuffle_ps(tmp8892, tmp8894, 68);
__m512 tmp8910 = _mm512_shuffle_ps(tmp8892, tmp8894, 238);
__m512 tmp8911 = _mm512_shuffle_f32x4(tmp8895, tmp8899, 136);
__m512 tmp8912 = _mm512_shuffle_f32x4(tmp8895, tmp8899, 221);
__m512 tmp8913 = _mm512_shuffle_f32x4(tmp8896, tmp8900, 136);
__m512 tmp8914 = _mm512_shuffle_f32x4(tmp8896, tmp8900, 221);
__m512 tmp8915 = _mm512_shuffle_f32x4(tmp8897, tmp8901, 136);
__m512 tmp8916 = _mm512_shuffle_f32x4(tmp8897, tmp8901, 221);
__m512 tmp8917 = _mm512_shuffle_f32x4(tmp8898, tmp8902, 136);
__m512 tmp8918 = _mm512_shuffle_f32x4(tmp8898, tmp8902, 221);
__m512 tmp8919 = _mm512_shuffle_f32x4(tmp8903, tmp8907, 136);
__m512 tmp8920 = _mm512_shuffle_f32x4(tmp8903, tmp8907, 221);
__m512 tmp8921 = _mm512_shuffle_f32x4(tmp8904, tmp8908, 136);
__m512 tmp8922 = _mm512_shuffle_f32x4(tmp8904, tmp8908, 221);
__m512 tmp8923 = _mm512_shuffle_f32x4(tmp8905, tmp8909, 136);
__m512 tmp8924 = _mm512_shuffle_f32x4(tmp8905, tmp8909, 221);
__m512 tmp8925 = _mm512_shuffle_f32x4(tmp8906, tmp8910, 136);
__m512 tmp8926 = _mm512_shuffle_f32x4(tmp8906, tmp8910, 221);
wt479 = _mm512_shuffle_f32x4(tmp8911, tmp8919, 136);
wt487 = _mm512_shuffle_f32x4(tmp8911, tmp8919, 221);
wt480 = _mm512_shuffle_f32x4(tmp8913, tmp8921, 136);
wt488 = _mm512_shuffle_f32x4(tmp8913, tmp8921, 221);
wt481 = _mm512_shuffle_f32x4(tmp8915, tmp8923, 136);
wt489 = _mm512_shuffle_f32x4(tmp8915, tmp8923, 221);
wt482 = _mm512_shuffle_f32x4(tmp8917, tmp8925, 136);
wt490 = _mm512_shuffle_f32x4(tmp8917, tmp8925, 221);
wt483 = _mm512_shuffle_f32x4(tmp8912, tmp8920, 136);
wt491 = _mm512_shuffle_f32x4(tmp8912, tmp8920, 221);
wt484 = _mm512_shuffle_f32x4(tmp8914, tmp8922, 136);
wt492 = _mm512_shuffle_f32x4(tmp8914, tmp8922, 221);
wt485 = _mm512_shuffle_f32x4(tmp8916, tmp8924, 136);
wt493 = _mm512_shuffle_f32x4(tmp8916, tmp8924, 221);
wt486 = _mm512_shuffle_f32x4(tmp8918, tmp8926, 136);
wt494 = _mm512_shuffle_f32x4(tmp8918, tmp8926, 221);
wt479 = _mm512_mul_ps(wt479, postMul25);
wt480 = _mm512_mul_ps(wt480, postMul25);
wt481 = _mm512_mul_ps(wt481, postMul25);
wt482 = _mm512_mul_ps(wt482, postMul25);
wt483 = _mm512_mul_ps(wt483, postMul25);
wt484 = _mm512_mul_ps(wt484, postMul25);
wt485 = _mm512_mul_ps(wt485, postMul25);
wt486 = _mm512_mul_ps(wt486, postMul25);
wt487 = _mm512_mul_ps(wt487, postMul25);
wt488 = _mm512_mul_ps(wt488, postMul25);
wt489 = _mm512_mul_ps(wt489, postMul25);
wt490 = _mm512_mul_ps(wt490, postMul25);
wt491 = _mm512_mul_ps(wt491, postMul25);
wt492 = _mm512_mul_ps(wt492, postMul25);
wt493 = _mm512_mul_ps(wt493, postMul25);
wt494 = _mm512_mul_ps(wt494, postMul25);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(1+16*c40)+(ptrdiff_t)0, 63>>cut19, wt479);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(2+16*c40)+(ptrdiff_t)0, 63>>cut19, wt480);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(3+16*c40)+(ptrdiff_t)0, 63>>cut19, wt481);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(4+16*c40)+(ptrdiff_t)0, 63>>cut19, wt482);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(5+16*c40)+(ptrdiff_t)0, 63>>cut19, wt483);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(6+16*c40)+(ptrdiff_t)0, 63>>cut19, wt484);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(7+16*c40)+(ptrdiff_t)0, 63>>cut19, wt485);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(8+16*c40)+(ptrdiff_t)0, 63>>cut19, wt486);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(9+16*c40)+(ptrdiff_t)0, 63>>cut19, wt487);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(10+16*c40)+(ptrdiff_t)0, 63>>cut19, wt488);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(11+16*c40)+(ptrdiff_t)0, 63>>cut19, wt489);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(12+16*c40)+(ptrdiff_t)0, 63>>cut19, wt490);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(13+16*c40)+(ptrdiff_t)0, 63>>cut19, wt491);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(14+16*c40)+(ptrdiff_t)0, 63>>cut19, wt492);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(15+16*c40)+(ptrdiff_t)0, 63>>cut19, wt493);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(16+16*c40)+(ptrdiff_t)0, 63>>cut19, wt494);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(1+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt479);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(2+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt480);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(3+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt481);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(4+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt482);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(5+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt483);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(6+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt484);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(7+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt485);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(8+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt486);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(9+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt487);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(10+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt488);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(11+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt489);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(12+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt490);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(13+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt491);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(14+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt492);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(15+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt493);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(16+16*c40)+(ptrdiff_t)3840, 4032>>cut19, wt494);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(1+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt479);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(2+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt480);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(3+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt481);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(4+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt482);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(5+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt483);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(6+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt484);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(7+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt485);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(8+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt486);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(9+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt487);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(10+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt488);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(11+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt489);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(12+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt490);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(13+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt491);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(14+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt492);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(15+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt493);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+24*(16+16*c40)+(ptrdiff_t)7680, 258048>>cut19, wt494);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(1+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt479);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(2+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt480);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(3+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt481);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(4+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt482);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(5+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt483);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(6+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt484);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(7+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt485);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(8+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt486);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(9+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt487);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(10+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt488);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(11+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt489);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(12+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt490);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(13+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt491);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(14+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt492);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(15+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt493);
_mm512_mask_storeu_ps(arranged17+82432*i51+3864*l43+4*cut19+8*(16+16*c40)+(ptrdiff_t)11520, 65535-(262143>>cut19), wt494);
}
}
}
}
}

// Runs DenseNet121OneArrangeWts9Callee1 on the given thread team.
// The task is declared with three dimensions and hull values 2, 1, 1
// (hull1 presumably holds the per-dimension extent of the task grid --
// confirm against DenseNet121ThreaderDo1). tensors77 is forwarded to the
// callee unchanged through the task's any1 field.
static void DenseNet121OneArrangeWts9(DenseNet121ThreaderTeam1* team52, char** tensors77) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 2;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = tensors77;
    job.callee1 = DenseNet121OneArrangeWts9Callee1;
    DenseNet121ThreaderDo1(team52, &job);
}

// Thread-task body that rearranges one column chunk of the input data.
//
// task82->any1 is a char*[] with
//   [0] source floats, 3136 bytes per row,
//   [1] interleaved (multiplier, addend) float pairs, one pair per row,
//   [2] destination buffer, 40960 bytes per chunk.
// pt46[1] picks the chunk. Any value other than 12 handles a 64-float chunk
// (four vectors per row); the value 12 handles the trailing 16-float chunk
// (one vector per row). Every element becomes max(0, x*mul + add) before it
// is stored. All 160 rows are processed in both cases.
// NOTE(review): the destination layout matches what
// DenseNet121OneApply9Callee1 reads; presumably it is the same buffer.
static void DenseNet121OneArrangeDats9Callee1(DenseNet121ThreaderTask1* task82, int64_t* pt46) {
    char** tensors80 = task82->any1;
    const ptrdiff_t chunk = pt46[1];
    char*restrict src = tensors80[0];
    char*restrict bn = tensors80[1];
    char*restrict dst = tensors80[2];
    const __m512 zero = _mm512_setzero_ps();

    if (chunk != 12) {
        // Full chunk: four 16-float vectors per row.
        for (ptrdiff_t row = 0; row < 160; ++row) {
            const float* mulAdd = (const float*)bn+(ptrdiff_t)2*row;
            const __m512 mul = _mm512_set1_ps(mulAdd[0]);
            const __m512 add = _mm512_set1_ps(mulAdd[1]);
            const char* in = src+256*chunk+3136*row;
            char* out = dst+40960*chunk+256*row;
            __m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
            __m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
            __m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
            __m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);
            v0 = _mm512_max_ps(zero, _mm512_fmadd_ps(v0, mul, add));
            v1 = _mm512_max_ps(zero, _mm512_fmadd_ps(v1, mul, add));
            v2 = _mm512_max_ps(zero, _mm512_fmadd_ps(v2, mul, add));
            v3 = _mm512_max_ps(zero, _mm512_fmadd_ps(v3, mul, add));
            _mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
        }
        return;
    }

    // Trailing chunk: a single 16-float vector per row, packed 64 bytes apart.
    for (ptrdiff_t row = 0; row < 160; ++row) {
        const float* mulAdd = (const float*)bn+(ptrdiff_t)2*row;
        const __m512 mul = _mm512_set1_ps(mulAdd[0]);
        const __m512 add = _mm512_set1_ps(mulAdd[1]);
        __m512 v = _mm512_maskz_loadu_ps(65535, src+256*chunk+3136*row);
        v = _mm512_max_ps(zero, _mm512_fmadd_ps(v, mul, add));
        _mm512_mask_storeu_ps(dst+40960*chunk+64*row, 65535, v);
    }
}

// Runs DenseNet121OneArrangeDats9Callee1 on the given thread team.
// The task is declared with four dimensions and hull values 1, 13, 1, 1.
// The callee reads only coordinate 1, which selects one of the 13 column
// chunks (12 full chunks plus the trailing one). tensors79 is forwarded to
// the callee unchanged through the task's any1 field.
static void DenseNet121OneArrangeDats9(DenseNet121ThreaderTeam1* team53, char** tensors79) {
    DenseNet121ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[0] = 1;
    job.hull1[1] = 13;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    job.any1 = tensors79;
    job.callee1 = DenseNet121OneArrangeDats9Callee1;
    DenseNet121ThreaderDo1(team53, &job);
}

// Thread-task body of the "OneApply9" step: multiplies arranged weights by
// arranged data with register-resident accumulators, clamps every result at
// zero and stores it to the output buffer.
//
// task84->any1 points to a void*[] whose element 0 is a char*[] with
//   [0] arranged weights, [1] arranged data, [2] output.
// pt47[0] (w48) selects which weight-row groups this task handles.
// pt47[1] (d15) selects the column chunk: values other than 12 are 64-float
// chunks (four vectors), 12 is the trailing 16-float chunk (one vector).
// NOTE(review): rows presumably are output channels and columns spatial
// positions of a 1x1 convolution with fused ReLU -- confirm with the generator.
static void DenseNet121OneApply9Callee1(DenseNet121ThreaderTask1* task84, int64_t* pt47) {
void** pair22 = task84->any1;
char** tensors82 = pair22[0];
// e20 and g22 are fixed at zero, so the base-pointer offsets below vanish.
ptrdiff_t e20 = 0;
ptrdiff_t g22 = 0;
ptrdiff_t d15 = pt47[1];
ptrdiff_t w48 = pt47[0];
char*restrict arrangedWts9 = tensors82[0]+428032*e20+(ptrdiff_t)82432*1*g22;
char*restrict arrangedDats9 = tensors82[1]+2618560*e20+(ptrdiff_t)501760*1*g22;
char*restrict datPtr24 = tensors82[2]+(ptrdiff_t)401408*1*g22;
ptrdiff_t ii27 = 1;
for (ptrdiff_t i53 = 0; i53 < ii27; ++i53) {
ptrdiff_t j43 = 1*d15;
// jj47 equals the starting j43, so the check at the bottom of the j43 loop
// returns after a single pass: one task handles exactly one column chunk.
ptrdiff_t jj47 = j43+0;
// Full 64-float chunks (j43 != 12); the trailing chunk is handled after this loop.
for (; j43 != 12; ++j43) {
// Six-row groups for this task: 4*w48 .. 4*w48+3 when w48 < 4. Otherwise kk46
// is at least 21, the loop runs until k118 == 21 and falls through to the
// two-row tail group below.
ptrdiff_t k118 = 4*w48;
ptrdiff_t kk46 = k118+(w48 < 4 ? 3 : 5);
for (; k118 != 21; ++k118) {
// With s33 == -1 the offsets 24*s33+24 .. 24*s33+44 resolve to the first six
// floats of the group; they seed the accumulators (presumably the per-row
// bias -- TODO confirm against the weight-arranging code).
ptrdiff_t s33 = -1;
__m512 sum361 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)24));
__m512 sum365 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)28));
__m512 sum369 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)32));
__m512 sum373 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)36));
__m512 sum377 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)40));
__m512 sum381 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)44));
// Each row gets four accumulators, one per 16-float vector of the chunk.
__m512 sum362 = sum361;
__m512 sum363 = sum361;
__m512 sum364 = sum361;
__m512 sum366 = sum365;
__m512 sum367 = sum365;
__m512 sum368 = sum365;
__m512 sum370 = sum369;
__m512 sum371 = sum369;
__m512 sum372 = sum369;
__m512 sum374 = sum373;
__m512 sum375 = sum373;
__m512 sum376 = sum373;
__m512 sum378 = sum377;
__m512 sum379 = sum377;
__m512 sum380 = sum377;
__m512 sum382 = sum381;
__m512 sum383 = sum381;
__m512 sum384 = sum381;
// 160 accumulation steps: each loads four data vectors once and reuses them
// for six broadcast weights (24 fused multiply-adds per step).
for (s33 = 0; s33 < 160; ++s33) {
__m512 dat1605 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s33+(ptrdiff_t)0);
__m512 dat1606 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s33+(ptrdiff_t)64);
__m512 dat1607 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s33+(ptrdiff_t)128);
__m512 dat1608 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s33+(ptrdiff_t)192);
__m512 wt527 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)24));
sum361 = _mm512_fmadd_ps(wt527, dat1605, sum361);
sum362 = _mm512_fmadd_ps(wt527, dat1606, sum362);
sum363 = _mm512_fmadd_ps(wt527, dat1607, sum363);
sum364 = _mm512_fmadd_ps(wt527, dat1608, sum364);
__m512 wt528 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)28));
sum365 = _mm512_fmadd_ps(wt528, dat1605, sum365);
sum366 = _mm512_fmadd_ps(wt528, dat1606, sum366);
sum367 = _mm512_fmadd_ps(wt528, dat1607, sum367);
sum368 = _mm512_fmadd_ps(wt528, dat1608, sum368);
__m512 wt529 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)32));
sum369 = _mm512_fmadd_ps(wt529, dat1605, sum369);
sum370 = _mm512_fmadd_ps(wt529, dat1606, sum370);
sum371 = _mm512_fmadd_ps(wt529, dat1607, sum371);
sum372 = _mm512_fmadd_ps(wt529, dat1608, sum372);
__m512 wt530 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)36));
sum373 = _mm512_fmadd_ps(wt530, dat1605, sum373);
sum374 = _mm512_fmadd_ps(wt530, dat1606, sum374);
sum375 = _mm512_fmadd_ps(wt530, dat1607, sum375);
sum376 = _mm512_fmadd_ps(wt530, dat1608, sum376);
__m512 wt531 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)40));
sum377 = _mm512_fmadd_ps(wt531, dat1605, sum377);
sum378 = _mm512_fmadd_ps(wt531, dat1606, sum378);
sum379 = _mm512_fmadd_ps(wt531, dat1607, sum379);
sum380 = _mm512_fmadd_ps(wt531, dat1608, sum380);
__m512 wt532 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+24*s33+(ptrdiff_t)44));
sum381 = _mm512_fmadd_ps(wt532, dat1605, sum381);
sum382 = _mm512_fmadd_ps(wt532, dat1606, sum382);
sum383 = _mm512_fmadd_ps(wt532, dat1607, sum383);
sum384 = _mm512_fmadd_ps(wt532, dat1608, sum384);
}
// Clamp at zero and store. Consecutive rows of the group are 3136 bytes
// apart in the output; consecutive groups are 18816 bytes (6 rows) apart.
sum361 = _mm512_max_ps(_mm512_setzero_ps(), sum361);
sum362 = _mm512_max_ps(_mm512_setzero_ps(), sum362);
sum363 = _mm512_max_ps(_mm512_setzero_ps(), sum363);
sum364 = _mm512_max_ps(_mm512_setzero_ps(), sum364);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)0, 65535, sum361);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)64, 65535, sum362);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)128, 65535, sum363);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)192, 65535, sum364);
sum365 = _mm512_max_ps(_mm512_setzero_ps(), sum365);
sum366 = _mm512_max_ps(_mm512_setzero_ps(), sum366);
sum367 = _mm512_max_ps(_mm512_setzero_ps(), sum367);
sum368 = _mm512_max_ps(_mm512_setzero_ps(), sum368);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3136, 65535, sum365);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3200, 65535, sum366);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3264, 65535, sum367);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3328, 65535, sum368);
sum369 = _mm512_max_ps(_mm512_setzero_ps(), sum369);
sum370 = _mm512_max_ps(_mm512_setzero_ps(), sum370);
sum371 = _mm512_max_ps(_mm512_setzero_ps(), sum371);
sum372 = _mm512_max_ps(_mm512_setzero_ps(), sum372);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)6272, 65535, sum369);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)6336, 65535, sum370);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)6400, 65535, sum371);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)6464, 65535, sum372);
sum373 = _mm512_max_ps(_mm512_setzero_ps(), sum373);
sum374 = _mm512_max_ps(_mm512_setzero_ps(), sum374);
sum375 = _mm512_max_ps(_mm512_setzero_ps(), sum375);
sum376 = _mm512_max_ps(_mm512_setzero_ps(), sum376);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)9408, 65535, sum373);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)9472, 65535, sum374);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)9536, 65535, sum375);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)9600, 65535, sum376);
sum377 = _mm512_max_ps(_mm512_setzero_ps(), sum377);
sum378 = _mm512_max_ps(_mm512_setzero_ps(), sum378);
sum379 = _mm512_max_ps(_mm512_setzero_ps(), sum379);
sum380 = _mm512_max_ps(_mm512_setzero_ps(), sum380);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)12544, 65535, sum377);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)12608, 65535, sum378);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)12672, 65535, sum379);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)12736, 65535, sum380);
sum381 = _mm512_max_ps(_mm512_setzero_ps(), sum381);
sum382 = _mm512_max_ps(_mm512_setzero_ps(), sum382);
sum383 = _mm512_max_ps(_mm512_setzero_ps(), sum383);
sum384 = _mm512_max_ps(_mm512_setzero_ps(), sum384);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)15680, 65535, sum381);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)15744, 65535, sum382);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)15808, 65535, sum383);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)15872, 65535, sum384);
// Last group assigned to this task: nothing more to do.
if (k118 >= kk46) return;
}
// Tail group (k118 == 21): only two rows, so the weight stride is 8 bytes
// per step instead of 24; the seed values again sit at offset 0 of the group.
ptrdiff_t s34 = -1;
__m512 sum385 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+8*s34+(ptrdiff_t)8));
__m512 sum389 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+8*s34+(ptrdiff_t)12));
__m512 sum386 = sum385;
__m512 sum387 = sum385;
__m512 sum388 = sum385;
__m512 sum390 = sum389;
__m512 sum391 = sum389;
__m512 sum392 = sum389;
for (s34 = 0; s34 < 160; ++s34) {
__m512 dat1609 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s34+(ptrdiff_t)0);
__m512 dat1610 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s34+(ptrdiff_t)64);
__m512 dat1611 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s34+(ptrdiff_t)128);
__m512 dat1612 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+256*s34+(ptrdiff_t)192);
__m512 wt533 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+8*s34+(ptrdiff_t)8));
sum385 = _mm512_fmadd_ps(wt533, dat1609, sum385);
sum386 = _mm512_fmadd_ps(wt533, dat1610, sum386);
sum387 = _mm512_fmadd_ps(wt533, dat1611, sum387);
sum388 = _mm512_fmadd_ps(wt533, dat1612, sum388);
__m512 wt534 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k118+8*s34+(ptrdiff_t)12));
sum389 = _mm512_fmadd_ps(wt534, dat1609, sum389);
sum390 = _mm512_fmadd_ps(wt534, dat1610, sum390);
sum391 = _mm512_fmadd_ps(wt534, dat1611, sum391);
sum392 = _mm512_fmadd_ps(wt534, dat1612, sum392);
}
sum385 = _mm512_max_ps(_mm512_setzero_ps(), sum385);
sum386 = _mm512_max_ps(_mm512_setzero_ps(), sum386);
sum387 = _mm512_max_ps(_mm512_setzero_ps(), sum387);
sum388 = _mm512_max_ps(_mm512_setzero_ps(), sum388);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)0, 65535, sum385);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)64, 65535, sum386);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)128, 65535, sum387);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)192, 65535, sum388);
sum389 = _mm512_max_ps(_mm512_setzero_ps(), sum389);
sum390 = _mm512_max_ps(_mm512_setzero_ps(), sum390);
sum391 = _mm512_max_ps(_mm512_setzero_ps(), sum391);
sum392 = _mm512_max_ps(_mm512_setzero_ps(), sum392);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3136, 65535, sum389);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3200, 65535, sum390);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3264, 65535, sum391);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k118+(ptrdiff_t)3328, 65535, sum392);
// j43 has not advanced past its starting value here, so this always returns.
if (j43 >= jj47) return;
}
// Trailing column chunk (j43 == 12): one 16-float vector per row, so the
// arranged data advances 64 bytes per step and each row needs one accumulator.
ptrdiff_t k119 = 4*w48;
ptrdiff_t kk47 = k119+(w48 < 4 ? 3 : 5);
for (; k119 != 21; ++k119) {
// Same seeding trick as above: s35 == -1 addresses the first six floats of the group.
ptrdiff_t s35 = -1;
__m512 sum393 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)24));
__m512 sum394 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)28));
__m512 sum395 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)32));
__m512 sum396 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)36));
__m512 sum397 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)40));
__m512 sum398 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)44));
for (s35 = 0; s35 < 160; ++s35) {
__m512 dat1613 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+64*s35+(ptrdiff_t)0);
__m512 wt535 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)24));
sum393 = _mm512_fmadd_ps(wt535, dat1613, sum393);
__m512 wt536 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)28));
sum394 = _mm512_fmadd_ps(wt536, dat1613, sum394);
__m512 wt537 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)32));
sum395 = _mm512_fmadd_ps(wt537, dat1613, sum395);
__m512 wt538 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)36));
sum396 = _mm512_fmadd_ps(wt538, dat1613, sum396);
__m512 wt539 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)40));
sum397 = _mm512_fmadd_ps(wt539, dat1613, sum397);
__m512 wt540 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+24*s35+(ptrdiff_t)44));
sum398 = _mm512_fmadd_ps(wt540, dat1613, sum398);
}
sum393 = _mm512_max_ps(_mm512_setzero_ps(), sum393);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)0, 65535, sum393);
sum394 = _mm512_max_ps(_mm512_setzero_ps(), sum394);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)3136, 65535, sum394);
sum395 = _mm512_max_ps(_mm512_setzero_ps(), sum395);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)6272, 65535, sum395);
sum396 = _mm512_max_ps(_mm512_setzero_ps(), sum396);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)9408, 65535, sum396);
sum397 = _mm512_max_ps(_mm512_setzero_ps(), sum397);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)12544, 65535, sum397);
sum398 = _mm512_max_ps(_mm512_setzero_ps(), sum398);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)15680, 65535, sum398);
// Last group assigned to this task: nothing more to do.
if (k119 >= kk47) return;
}
// Two-row tail group (k119 == 21) of the trailing column chunk.
ptrdiff_t s36 = -1;
__m512 sum399 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+8*s36+(ptrdiff_t)8));
__m512 sum400 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+8*s36+(ptrdiff_t)12));
for (s36 = 0; s36 < 160; ++s36) {
__m512 dat1614 = _mm512_loadu_ps(arrangedDats9+501760*i53+40960*j43+64*s36+(ptrdiff_t)0);
__m512 wt541 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+8*s36+(ptrdiff_t)8));
sum399 = _mm512_fmadd_ps(wt541, dat1614, sum399);
__m512 wt542 = _mm512_set1_ps(*(float*)(arrangedWts9+82432*i53+3864*k119+8*s36+(ptrdiff_t)12));
sum400 = _mm512_fmadd_ps(wt542, dat1614, sum400);
}
sum399 = _mm512_max_ps(_mm512_setzero_ps(), sum399);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)0, 65535, sum399);
sum400 = _mm512_max_ps(_mm512_setzero_ps(), sum400);
_mm512_mask_storeu_ps(datPtr24+401408*i53+256*j43+18816*k119+(ptrdiff_t)3136, 65535, sum400);
}
}

// Runs DenseNet121OneApply9Callee1 on the given thread team.
// The task is declared with three dimensions and hull values 5, 13, 1.
// The callee reads coordinate 0 as its weight-group selector and coordinate 1
// as its column-chunk selector. The callee receives a two-slot pointer array
// whose first slot is tensors81 and whose second slot is null.
static void DenseNet121OneApply9(DenseNet121ThreaderTeam1* team54, char** tensors81) {
    void* args[] = {tensors81, 0};
    DenseNet121ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 5;
    job.hull1[1] = 13;
    job.hull1[2] = 1;
    job.any1 = args;
    job.callee1 = DenseNet121OneApply9Callee1;
    DenseNet121ThreaderDo1(team54, &job);
}

static void DenseNet121OneArrangeWts10Callee1(DenseNet121ThreaderTask1* task86, int64_t* pt48) {
char** tensors84 = task86->any1;
ptrdiff_t b58 = pt48[0];
char*restrict wtPtr13 = tensors84[0]+(ptrdiff_t)3340*0+(ptrdiff_t)98304*0;
char*restrict biasPtr13 = tensors84[1]+(ptrdiff_t)512*0;
char*restrict bnPtr19 = tensors84[2]+(ptrdiff_t)8*128*0;
char*restrict arranged19 = tensors84[3]+(ptrdiff_t)428032*0+(ptrdiff_t)98816*0;
ptrdiff_t ii28 = 1;
for (ptrdiff_t i54 = 0; i54 < ii28; ++i54) {
ptrdiff_t j44 = 4*b58;
ptrdiff_t jj48 = j44+4;
for (; j44 < jj48; ++j44) {
if (j44 < 7) {
ptrdiff_t k121 = 0+16*(j44-0);
ptrdiff_t l46 = (size_t)(0+k121)/6;
ptrdiff_t cut22 = (size_t)(0+k121)%6;
switch (cut22) {
case 0:;
case 2: {
__m512 sum402 = _mm512_maskz_loadu_ps(65535, biasPtr13+512*i54+4*k121);
__m512i pmMul26 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd26 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo25 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k121+128*i54));
__m512 masHi25 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k121+128*i54)+(ptrdiff_t)64);
__m512 postMul29 = _mm512_permutex2var_ps(masLo25, pmMul26, masHi25);
__m512 postAdd27 = _mm512_permutex2var_ps(masLo25, pmAdd26, masHi25);
sum402 = _mm512_fmadd_ps(sum402, postMul29, postAdd27);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*0+(ptrdiff_t)0, 63>>cut22, sum402);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*0+(ptrdiff_t)4608, 4032>>cut22, sum402);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*0+(ptrdiff_t)9216, 65535-(4095>>cut22), sum402);
ptrdiff_t c45 = 0;
for (; c45 != 12; ++c45) {
__m512 wt559 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)0);
__m512 wt560 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)768);
__m512 wt561 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)1536);
__m512 wt562 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)2304);
__m512 wt563 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)3072);
__m512 wt564 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)3840);
__m512 wt565 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)4608);
__m512 wt566 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)5376);
__m512 wt567 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)6144);
__m512 wt568 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)6912);
__m512 wt569 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)7680);
__m512 wt570 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)8448);
__m512 wt571 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)9216);
__m512 wt572 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)9984);
__m512 wt573 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)10752);
__m512 wt574 = _mm512_maskz_loadu_ps(65535, wtPtr13+98304*i54+768*k121+64*c45+(ptrdiff_t)11520);
__m512 tmp8927 = _mm512_unpacklo_ps(wt559, wt560);
__m512 tmp8928 = _mm512_unpackhi_ps(wt559, wt560);
__m512 tmp8929 = _mm512_unpacklo_ps(wt561, wt562);
__m512 tmp8930 = _mm512_unpackhi_ps(wt561, wt562);
__m512 tmp8931 = _mm512_unpacklo_ps(wt563, wt564);
__m512 tmp8932 = _mm512_unpackhi_ps(wt563, wt564);
__m512 tmp8933 = _mm512_unpacklo_ps(wt565, wt566);
__m512 tmp8934 = _mm512_unpackhi_ps(wt565, wt566);
__m512 tmp8935 = _mm512_unpacklo_ps(wt567, wt568);
__m512 tmp8936 = _mm512_unpackhi_ps(wt567, wt568);
__m512 tmp8937 = _mm512_unpacklo_ps(wt569, wt570);
__m512 tmp8938 = _mm512_unpackhi_ps(wt569, wt570);
__m512 tmp8939 = _mm512_unpacklo_ps(wt571, wt572);
__m512 tmp8940 = _mm512_unpackhi_ps(wt571, wt572);
__m512 tmp8941 = _mm512_unpacklo_ps(wt573, wt574);
__m512 tmp8942 = _mm512_unpackhi_ps(wt573, wt574);
__m512 tmp8943 = _mm512_shuffle_ps(tmp8927, tmp8929, 68);
__m512 tmp8944 = _mm512_shuffle_ps(tmp8927, tmp8929, 238);
__m512 tmp8945 = _mm512_shuffle_ps(tmp8928, tmp8930, 68);
__m512 tmp8946 = _mm512_shuffle_ps(tmp8928, tmp8930, 238);
__m512 tmp8947 = _mm512_shuffle_ps(tmp8931, tmp8933, 68);
__m512 tmp8948 = _mm512_shuffle_ps(tmp8931, tmp8933, 238);
__m512 tmp8949 = _mm512_shuffle_ps(tmp8932, tmp8934, 68);
__m512 tmp8950 = _mm512_shuffle_ps(tmp8932, tmp8934, 238);
__m512 tmp8951 = _mm512_shuffle_ps(tmp8935, tmp8937, 68);
__m512 tmp8952 = _mm512_shuffle_ps(tmp8935, tmp8937, 238);
__m512 tmp8953 = _mm512_shuffle_ps(tmp8936, tmp8938, 68);
__m512 tmp8954 = _mm512_shuffle_ps(tmp8936, tmp8938, 238);
__m512 tmp8955 = _mm512_shuffle_ps(tmp8939, tmp8941, 68);
__m512 tmp8956 = _mm512_shuffle_ps(tmp8939, tmp8941, 238);
__m512 tmp8957 = _mm512_shuffle_ps(tmp8940, tmp8942, 68);
__m512 tmp8958 = _mm512_shuffle_ps(tmp8940, tmp8942, 238);
__m512 tmp8959 = _mm512_shuffle_f32x4(tmp8943, tmp8947, 136);
__m512 tmp8960 = _mm512_shuffle_f32x4(tmp8943, tmp8947, 221);
__m512 tmp8961 = _mm512_shuffle_f32x4(tmp8944, tmp8948, 136);
__m512 tmp8962 = _mm512_shuffle_f32x4(tmp8944, tmp8948, 221);
__m512 tmp8963 = _mm512_shuffle_f32x4(tmp8945, tmp8949, 136);
__m512 tmp8964 = _mm512_shuffle_f32x4(tmp8945, tmp8949, 221);
__m512 tmp8965 = _mm512_shuffle_f32x4(tmp8946, tmp8950, 136);
__m512 tmp8966 = _mm512_shuffle_f32x4(tmp8946, tmp8950, 221);
__m512 tmp8967 = _mm512_shuffle_f32x4(tmp8951, tmp8955, 136);
__m512 tmp8968 = _mm512_shuffle_f32x4(tmp8951, tmp8955, 221);
__m512 tmp8969 = _mm512_shuffle_f32x4(tmp8952, tmp8956, 136);
__m512 tmp8970 = _mm512_shuffle_f32x4(tmp8952, tmp8956, 221);
__m512 tmp8971 = _mm512_shuffle_f32x4(tmp8953, tmp8957, 136);
__m512 tmp8972 = _mm512_shuffle_f32x4(tmp8953, tmp8957, 221);
__m512 tmp8973 = _mm512_shuffle_f32x4(tmp8954, tmp8958, 136);
__m512 tmp8974 = _mm512_shuffle_f32x4(tmp8954, tmp8958, 221);
wt559 = _mm512_shuffle_f32x4(tmp8959, tmp8967, 136);
wt567 = _mm512_shuffle_f32x4(tmp8959, tmp8967, 221);
wt560 = _mm512_shuffle_f32x4(tmp8961, tmp8969, 136);
wt568 = _mm512_shuffle_f32x4(tmp8961, tmp8969, 221);
wt561 = _mm512_shuffle_f32x4(tmp8963, tmp8971, 136);
wt569 = _mm512_shuffle_f32x4(tmp8963, tmp8971, 221);
wt562 = _mm512_shuffle_f32x4(tmp8965, tmp8973, 136);
wt570 = _mm512_shuffle_f32x4(tmp8965, tmp8973, 221);
wt563 = _mm512_shuffle_f32x4(tmp8960, tmp8968, 136);
wt571 = _mm512_shuffle_f32x4(tmp8960, tmp8968, 221);
wt564 = _mm512_shuffle_f32x4(tmp8962, tmp8970, 136);
wt572 = _mm512_shuffle_f32x4(tmp8962, tmp8970, 221);
wt565 = _mm512_shuffle_f32x4(tmp8964, tmp8972, 136);
wt573 = _mm512_shuffle_f32x4(tmp8964, tmp8972, 221);
wt566 = _mm512_shuffle_f32x4(tmp8966, tmp8974, 136);
wt574 = _mm512_shuffle_f32x4(tmp8966, tmp8974, 221);
wt559 = _mm512_mul_ps(wt559, postMul29);
wt560 = _mm512_mul_ps(wt560, postMul29);
wt561 = _mm512_mul_ps(wt561, postMul29);
wt562 = _mm512_mul_ps(wt562, postMul29);
wt563 = _mm512_mul_ps(wt563, postMul29);
wt564 = _mm512_mul_ps(wt564, postMul29);
wt565 = _mm512_mul_ps(wt565, postMul29);
wt566 = _mm512_mul_ps(wt566, postMul29);
wt567 = _mm512_mul_ps(wt567, postMul29);
wt568 = _mm512_mul_ps(wt568, postMul29);
wt569 = _mm512_mul_ps(wt569, postMul29);
wt570 = _mm512_mul_ps(wt570, postMul29);
wt571 = _mm512_mul_ps(wt571, postMul29);
wt572 = _mm512_mul_ps(wt572, postMul29);
wt573 = _mm512_mul_ps(wt573, postMul29);
wt574 = _mm512_mul_ps(wt574, postMul29);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(1+16*c45)+(ptrdiff_t)0, 63>>cut22, wt559);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(2+16*c45)+(ptrdiff_t)0, 63>>cut22, wt560);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(3+16*c45)+(ptrdiff_t)0, 63>>cut22, wt561);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(4+16*c45)+(ptrdiff_t)0, 63>>cut22, wt562);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(5+16*c45)+(ptrdiff_t)0, 63>>cut22, wt563);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(6+16*c45)+(ptrdiff_t)0, 63>>cut22, wt564);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(7+16*c45)+(ptrdiff_t)0, 63>>cut22, wt565);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(8+16*c45)+(ptrdiff_t)0, 63>>cut22, wt566);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(9+16*c45)+(ptrdiff_t)0, 63>>cut22, wt567);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(10+16*c45)+(ptrdiff_t)0, 63>>cut22, wt568);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(11+16*c45)+(ptrdiff_t)0, 63>>cut22, wt569);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(12+16*c45)+(ptrdiff_t)0, 63>>cut22, wt570);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(13+16*c45)+(ptrdiff_t)0, 63>>cut22, wt571);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(14+16*c45)+(ptrdiff_t)0, 63>>cut22, wt572);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(15+16*c45)+(ptrdiff_t)0, 63>>cut22, wt573);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(16+16*c45)+(ptrdiff_t)0, 63>>cut22, wt574);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(1+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt559);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(2+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt560);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(3+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt561);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(4+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt562);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(5+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt563);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(6+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt564);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(7+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt565);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(8+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt566);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(9+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt567);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(10+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt568);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(11+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt569);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(12+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt570);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(13+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt571);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(14+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt572);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(15+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt573);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(16+16*c45)+(ptrdiff_t)4608, 4032>>cut22, wt574);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(1+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt559);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(2+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt560);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(3+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt561);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(4+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt562);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(5+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt563);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(6+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt564);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(7+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt565);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(8+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt566);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(9+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt567);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(10+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt568);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(11+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt569);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(12+16*c45)+(ptrdiff_t)9216, 65535-(4095>>cut22), wt570);
_mm512_mask_storeu_ps(arranged19+98816*i54+4632*l46+4*cut22+24*(13+16*c45)+(ptrdif